In [1]:
import re
import pymorphy2
from nltk.corpus import stopwords
from pandas import read_csv, DataFrame
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.svm import SVR
import matplotlib.pyplot as plt

morph = pymorphy2.MorphAnalyzer()
stops = set(stopwords.words("english")) | set(stopwords.words("russian"))

def normalize(text):
    new_text = re.sub("[^а-яА-Яa-zA-Z]", " ", text)
    # lower case
    words = new_text.lower().split()
    # remove stop words
    words = [w for w in words if not w in stops]
    # normal form
    words = [morph.parse(w)[0].normal_form for w in words]
    return (words)

def get_name(reg_model):
     name = str(reg_model.named_steps['Reg'])
     name = name[:name.index('(')]
     return name
    
def show_most_informative_features(reg_model):    
    if(get_name(reg_model) == 'LinearRegression'):
        feature_names = reg_model.named_steps['union'].transformer_list[0][1].named_steps['count_vect'].get_feature_names()
        scaler = StandardScaler()
        coefs_help = reg_model.named_steps['Reg'].coef_
        scaler.fit(coefs_help)
        blabla = scaler.transform(coefs_help)
        feature_names.append('coms')
        feature_names.append('favs')
        feature_names.append('size')
        coefs_with_fns = sorted(zip(blabla, feature_names), reverse=True)
        print(DataFrame(coefs_with_fns[:10],columns = ['feature', 'coefs']))

def reg_methods(reg_model): 
    model = {}
    model['name'] = get_name(reg_model)
    X = data.drop(['likes'],axis=1)
    Y = data['likes']
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.4)
    reg_model.fit(x_train,y_train)        
    model['score'] = reg_model.score(x_test,y_test)
    show_most_informative_features(reg_model)
    return model

    

def paint_reg_methods(test_models): # Построение графика
        fig, axes = plt.subplots(figsize=(16, 4))
        test_models.score.plot(kind='barh', title='Regression Methods', fontsize=10, stacked=True)
        fig.savefig('result.png')

dataset = read_csv('test.tsv',sep ='\t',error_bad_lines=False)
data = DataFrame(dataset, columns=['likes','coms','favs','size','text'])

In [None]:
print(dataset.corr())


In [2]:
class ItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_frame):
        return data_frame[[self.key]]   
class Converter(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self

    def transform(self, data_frame):
        return data_frame.values.ravel()
    
lin_reg = Pipeline([('union', FeatureUnion(
                transformer_list=[('text_vect', Pipeline([('selector', ItemSelector(key='text')),
                                                            ('converter', Converter()),
                                                          ('count_vect', TfidfVectorizer(tokenizer=normalize,stop_words = stops))])),
                                 ('coms_sc', Pipeline([('selector', ItemSelector(key='coms')),
                                                           ('std_scaler',  StandardScaler())])),
                                 ('favs_sc',Pipeline([('selector', ItemSelector(key='favs')),
                                                           ('std_scaler',  StandardScaler())])),
                                 ('size_sc',Pipeline([('selector', ItemSelector(key='size')),
                                                           ('std_scaler',  StandardScaler())]))],
                transformer_weights={'text_vect': 1.0,'coms_sc': 1.0,'favs_sc': 1.0,'size_sc': 1.0 })),

                          ('Reg', LinearRegression())])
rand_fores_reg = Pipeline([('union', FeatureUnion(
                transformer_list=[('text_vect', Pipeline([('selector', ItemSelector(key='text')),
                                                            ('converter', Converter()),
                                                          ('count_vect', TfidfVectorizer(tokenizer=normalize,stop_words = stops))])),
                                 ('coms_sc', Pipeline([('selector', ItemSelector(key='coms')),
                                                           ('std_scaler',  StandardScaler())])),
                                 ('favs_sc',Pipeline([('selector', ItemSelector(key='favs')),
                                                           ('std_scaler',  StandardScaler())])),
                                 ('size_sc',Pipeline([('selector', ItemSelector(key='size')),
                                                           ('std_scaler',  StandardScaler())]))],
                transformer_weights={'text_vect': 1.0,'coms_sc': 1.0,'favs_sc': 1.0,'size_sc': 1.0 })),
                          ('Reg', RandomForestRegressor(n_estimators=10, max_features='sqrt'))])
kneigh_reg = Pipeline([('union', FeatureUnion(
                transformer_list=[('text_vect', Pipeline([('selector', ItemSelector(key='text')),
                                                            ('converter', Converter()),
                                                          ('count_vect', TfidfVectorizer(tokenizer=normalize,stop_words = stops))])),
                                 ('coms_sc', Pipeline([('selector', ItemSelector(key='coms')),
                                                           ('std_scaler',  StandardScaler())])),
                                 ('favs_sc',Pipeline([('selector', ItemSelector(key='favs')),
                                                           ('std_scaler',  StandardScaler())])),
                                 ('size_sc',Pipeline([('selector', ItemSelector(key='size')),
                                                           ('std_scaler',  StandardScaler())]))],
                transformer_weights={'text_vect': 1.0,'coms_sc': 1.0,'favs_sc': 1.0,'size_sc': 1.0 })),
                          ('Reg', KNeighborsRegressor(n_neighbors=6))])
svr_reg = Pipeline([('union', FeatureUnion(
                transformer_list=[('text_vect', Pipeline([('selector', ItemSelector(key='text')),
                                                            ('converter', Converter()),
                                                          ('count_vect', TfidfVectorizer(tokenizer=normalize,stop_words = stops))])),
                                 ('coms_sc', Pipeline([('selector', ItemSelector(key='coms')),
                                                           ('std_scaler',  StandardScaler())])),
                                 ('favs_sc',Pipeline([('selector', ItemSelector(key='favs')),
                                                           ('std_scaler',  StandardScaler())])),
                                 ('size_sc',Pipeline([('selector', ItemSelector(key='size')),
                                                           ('std_scaler',  StandardScaler())]))],
                transformer_weights={'text_vect': 1.0,'coms_sc': 1.0,'favs_sc': 1.0,'size_sc': 1.0 })),
                          ('Reg',SVR(kernel='linear'))])



test_models = DataFrame()

In [3]:
test_models = test_models.append([reg_methods(lin_reg)],sort=True)
#bla = test_models.named_steps['Reg'].coef_[:-3]

ValueError: Expected 2D array, got 1D array instead:
array=[12.7955247  -2.88085386  6.85987867 ... 32.12323144 39.90287634
 -2.29109495].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
test_models = test_models.append([reg_methods(rand_fores_reg)],sort=True)

In [None]:
test_models = test_models.append([reg_methods(kneigh_reg)],sort=True)
test_models = test_models.append([reg_methods(svr_reg)],sort=True)
test_models.set_index('name',inplace=True)
paint_reg_methods(test_models)

