In [28]:
import spacy
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import re

In [2]:
def get_data(path):
    for text_file in path.glob("*.txt"):
        with open(text_file, encoding="utf8") as file:
            yield file.read()
            
path_train_pos = Path('train/pos')
path_train_neg = Path('train/neg')

path_test_pos = Path('test/pos')
path_test_neg = Path('test/neg')


df_train_pos = pd.DataFrame(list((get_data(path_train_pos))), columns=['text'])
df_train_neg = pd.DataFrame(list((get_data(path_train_neg))), columns=['text'])

df_test_pos = pd.DataFrame(list((get_data(path_test_pos))), columns=['text'])
df_test_neg = pd.DataFrame(list((get_data(path_test_neg))), columns=['text'])


df_train_pos['target'] = 1
df_train_neg['target'] = 0
df_test_pos['target'] = 1
df_test_neg['target'] = 0

In [26]:
def drop_numbers(text):
    return [
        word for word in text
        if not re.search('[0-9_]', word)]

In [63]:
train = pd.concat([df_train_pos.sample(500), df_train_neg.sample(500)], ignore_index=True)
test = pd.concat([df_test_pos.sample(500), df_test_neg.sample(500)], ignore_index=True)

In [36]:
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser"])

In [None]:
#vectorizer = TfidfVectorizer()
#tf_idf = vectorizer.fit_transform(texts.text)
# Сжатие мешка в tfidf
#svd = TruncatedSVD(n_components=200, n_iter=5)
#tf_idf_svd = svd.fit_transform(tf_idf)

In [64]:
train['text'] = train['text'].str.replace(r'[\d]', ' ',regex=True).str.lower()

train['text'] = train['text'].map(lambda x: nlp(x))

train['text'] = train['text'].map(
    lambda doc:nlp(
        ' '.join(
            [notStopWords.text for notStopWords in doc if not notStopWords.is_stop]
        )
    )
)

train['text'] = train['text'].map(
    lambda doc: nlp(
        ' '.join(
            [token.orth_ for token in doc if not token.is_punct | token.is_space]
        )
    )
)

train['text'] = train['text'].map(
    lambda doc:' '.join([token.lemma_ for token in doc]
        )
    )

train

Unnamed: 0,text,target
0,funny sad charm word float head watch beautifu...,1
1,pialat film people extreme emotional situation...,1
2,see film year knowledge little rusty remember ...,1
3,great funny sweet movie morgan freeman play me...,1
4,run film rend pick freebird struggle movie won...,1
...,...,...
995,sadly true legiunea sträinä expose absolute cl...,0
996,film star slapchop vince offer write edit dire...,0
997,good storyline good thrill second destroy movi...,0
998,unlike find premise theme bite offensive execu...,0


In [65]:
test['text'] = test['text'].str.replace(r'[\d]', ' ',regex=True).str.lower()

test['text'] = test['text'].map(lambda x: nlp(x))

test['text'] = test['text'].map(
    lambda doc:nlp(
        ' '.join(
            [notStopWords.text for notStopWords in doc if not notStopWords.is_stop]
        )
    )
)

test['text'] = test['text'].map(
    lambda doc: nlp(
        ' '.join(
            [token.orth_ for token in doc if not token.is_punct | token.is_space]
        )
    )
)

test['text'] = test['text'].map(
    lambda doc: ' '.join([token.lemma_ for token in doc]
        )
)

test

Unnamed: 0,text,target
0,stan laurel regard putt pant philip true l&h ...,1
1,mother go wind collector plate rhett scarlett ...,1
2,interest people know film recourse phoolan dev...,1
3,realize people comment nature racist homophobi...,1
4,meester sharky look normal table fancy cocktai...,1
...,...,...
995,oh bad clothe wrong synth music wrong david ha...,0
996,question wrong screen adaptation stephen king ...,0
997,husband want watch film review paper say well ...,0
998,james stewart play johnny mason lawyer carole ...,0


In [47]:
train['text'] = train['text'].map(lambda doc: ' '.join(drop_numbers(doc.split(' '))))

In [48]:
test['text'] = test['text'].map(lambda doc: ' '.join(drop_numbers(doc.split(' '))))

In [66]:
cv = CountVectorizer()
bag_of_words_train = cv.fit_transform(train['text'])
bag_of_words_test = cv.transform(test['text'])

In [67]:
pd.DataFrame(
    data=bag_of_words_train.toarray(),
    columns=cv.get_feature_names()

)

Unnamed: 0,10,11,13,16,19,aaargh,aaron,aarp,abandon,abbot,...,zone,zoo,zoom,zoot,zorba,zoé,zschering,zuckerman,zulu,zzzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [68]:
params = {'C': np.logspace(-5, 5, 11)}
clf = LogisticRegression()
cv = GridSearchCV(clf, params, n_jobs=-1, scoring='roc_auc',cv=5)
cv.fit(bag_of_words_train, train['target'])



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': array([1.e-05, 1.e-04, 1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02,
       1.e+03, 1.e+04, 1.e+05])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='roc_auc', verbose=0)

In [69]:
pd.DataFrame(cv.cv_results_)[['mean_test_score', 'params']].sort_values('mean_test_score', ascending=False)

Unnamed: 0,mean_test_score,params
4,0.86966,{'C': 0.1}
5,0.86784,{'C': 1.0}
6,0.86632,{'C': 10.0}
7,0.8643,{'C': 100.0}
8,0.8603,{'C': 1000.0}
9,0.85576,{'C': 10000.0}
3,0.85562,{'C': 0.01}
10,0.8545,{'C': 100000.0}
2,0.80434,{'C': 0.001}
1,0.71944,{'C': 0.0001}


In [70]:
X, y = bag_of_words_train, train['target']
X_test, y_test = bag_of_words_test, test['target']
clf = LogisticRegression(C=0.1, random_state=0).fit(X, y)

In [71]:
clf.predict(X_test)

array([1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,

In [72]:
clf.predict_proba(X_test)
clf.score(X_test, y_test)

0.814

In [53]:
vectorizer = TfidfVectorizer()
tf_idf_train = vectorizer.fit_transform(train['text'])
tf_idf_test = vectorizer.transform(test['text'])
#Сжатие мешка в tfidf
#svd = TruncatedSVD(n_components=200, n_iter=5)
#tf_idf_svd = svd.fit_transform(train['text'])

X, y = tf_idf_train, train['target']
X_test, y_test = tf_idf_test, test['target']
clf = LogisticRegression(C=0.1, penalty='l2', random_state=0).fit(X, y)



In [54]:
clf.predict(X_test)

array([1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [55]:
clf.predict_proba(X_test)
clf.score(X_test, y_test)

0.686

In [29]:
train['text'] = train['text'].apply(drop_numbers)