In [1]:
import pandas as pd
import numpy as np
import codecs
import spacy
import re
import nltk
import joblib
import random

#from spacy.training.example import Example

from collections import defaultdict
from pathlib import Path

from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.corpus import stopwords

#from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier

from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

In [2]:
def df_upload (df, path):
    results = defaultdict(list)
    for file in Path(path).iterdir():
        with codecs.open(file, "r", "utf-8") as file_open:
            results["file_name"].append(file.name)
            results["text"].append(file_open.read())
        df = pd.DataFrame(results)
    return df

In [3]:
df_test_neg, df_test_pos, df_train_neg, df_train_pos = pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
df_test_neg = df_upload (df_test_neg, "D:/Хаклаб/ML разработка/aclImdb/test/neg")
df_test_pos = df_upload (df_test_pos, "D:/Хаклаб/ML разработка/aclImdb/test/pos")
df_train_neg = df_upload (df_train_neg, "D:/Хаклаб/ML разработка/aclImdb/train/neg")
df_train_pos = df_upload (df_train_pos, "D:/Хаклаб/ML разработка/aclImdb/train/pos")

In [4]:
df_test_neg['is_pos'], df_train_neg['is_pos'] = 0, 0
df_test_pos['is_pos'], df_train_pos['is_pos'] = 1, 1

In [5]:
print (df_test_neg.head())
print (df_test_pos.head())
print (df_train_neg.head())
print (df_train_pos.head())

     file_name                                               text  is_pos
0      0_2.txt  Once again Mr. Costner has dragged out a movie...       0
1  10000_4.txt  This is an example of why the majority of acti...       0
2  10001_1.txt  First of all I hate those moronic rappers, who...       0
3  10002_3.txt  Not even the Beatles could write songs everyon...       0
4  10003_3.txt  Brass pictures (movies is not a fitting word f...       0
     file_name                                               text  is_pos
0     0_10.txt  I went and saw this movie last night after bei...       1
1  10000_7.txt  Actor turned director Bill Paxton follows up h...       1
2  10001_9.txt  As a recreational golfer with some knowledge o...       1
3  10002_8.txt  I saw this film in a sneak preview, and it is ...       1
4  10003_8.txt  Bill Paxton has taken the true story of the 19...       1
     file_name                                               text  is_pos
0      0_3.txt  Story of a man who has

In [43]:
df_train = pd.DataFrame()
df_test = pd.DataFrame()

df_train = pd.concat ([df_train_neg, df_train_pos], sort=False, axis=0)
df_test = pd.concat ([df_test_neg, df_test_pos], sort=False, axis=0)

df_train = df_train.sample(frac=1).reset_index(drop=True)
df_test = df_test.sample(frac=1).reset_index(drop=True)

print(df_train.head(), df_test.head())

     file_name                                               text  is_pos
0   3195_3.txt  Two qualifiers right up front: I actually thin...       0
1   2942_1.txt  This is hands down the worst movie of all time...       0
2   5812_4.txt  "This story is dedicated to women," according ...       0
3   2236_9.txt  The thing I remember most about this film is t...       1
4  11196_7.txt  'One-Round' Jack Sander is called that because...       1     file_name                                               text  is_pos
0  7511_7.txt  this movie is funny funny funny my favorite qu...       1
1  4662_1.txt  Being an unrelenting non-stop over-the-top exp...       0
2  6112_8.txt  I was first introduced to John Waters films by...       1
3  6983_4.txt  Yeah, I know his character was supposed to be ...       0
4  8941_3.txt  As much as I love trains, I couldn't stomach t...       0


Таким образом мы подготовили датафреймы для обучения и теста моделей. Далее необходимо привести слова к леммам.

In [7]:
df_train.info

<bound method DataFrame.info of          file_name                                               text  is_pos
0       1348_3.txt  this animated Inspector Gadget movie is pretty...       0
1      11523_7.txt  This is one of the funniest movies I have seen...       1
2       8569_9.txt  This is just as good as the original 101 if no...       1
3       1675_9.txt  Sergio Martino has impressed me recently with ...       1
4      11256_1.txt  What a waste of time! I've tried to sit throug...       0
...            ...                                                ...     ...
24995   1639_1.txt  "Whipped" is 82 minutes long. This review is 8...       0
24996  7740_10.txt  It is rare that one comes across a movie as fl...       1
24997   4090_3.txt  Shlock-merchant Leo Fulci takes a change of pa...       0
24998   4746_8.txt  This film appears to draw a borderline - on on...       1
24999  1893_10.txt  Although it's most certainly politically incor...       1

[25000 rows x 3 columns]>

In [8]:
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])



In [9]:
def lemmatize_text(text):    
    text = text.lower()
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])

In [44]:
%%time
df_train['text'] = df_train['text'].apply(lemmatize_text)
df_test['text'] = df_test['text'].apply(lemmatize_text)

CPU times: total: 9min 54s
Wall time: 9min 55s


Лемматизация проведена. Далее обозначим признаки.

In [45]:
target_train = df_train['is_pos']
features_train = df_train.drop(['file_name', 'is_pos'], axis=1)

target_test = df_test['is_pos']
features_test = df_test.drop(['file_name', 'is_pos'], axis=1)

Создадим матрицу TFIDF и применим TfidfVectorizer

In [12]:
nltk.download('stopwords')
stopwords = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zmeis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [46]:
count_tf_idf = TfidfVectorizer(stop_words=stopwords)
vect = count_tf_idf.fit(features_train['text'].values) #learn vectorizer on train features

In [47]:
features_train = vect.transform(features_train['text'].values)
features_test = vect.transform(features_test['text'].values)

Теперь можно пробовать подбирать модели машинного обучения. Начнем с логистической регрессии.

In [15]:
log_reg = LogisticRegression(class_weight='balanced').fit(features_train, target_train)

In [16]:
print(f1_score(target_test, log_reg.predict(features_test)))

0.8779222851671586


Рандомный лес.

In [17]:
parametrs = { 'n_estimators': range (10, 51, 10),
              'max_depth': range (1,10, 2)}

In [18]:
rfc = RandomForestClassifier(random_state=12345)

In [19]:
grid = GridSearchCV(rfc, parametrs, cv=5, n_jobs=-1, scoring='f1')
grid.fit(features_train, target_train)
grid.best_params_

{'max_depth': 9, 'n_estimators': 50}

In [20]:
rfc = RandomForestClassifier(random_state=12345, n_estimators=50, max_depth=9, class_weight='balanced').fit(features_train, target_train)

In [21]:
print(f1_score(target_test, rfc.predict(features_test)))

0.8039345282065239


Дерево решений

In [22]:
parametrs = { 'penalty': ['l2', 'l1'],
              'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
              'power_t': [-1.0, -0.5, 0.0, 0.5, 1]}

In [23]:
sgdc = SGDClassifier(random_state=12345, class_weight='balanced')

In [24]:
grid = GridSearchCV(sgdc, parametrs, cv=5, n_jobs=-1, scoring='f1')
grid.fit(features_train, target_train)
grid.best_params_

{'alpha': 0.0001, 'penalty': 'l2', 'power_t': -1.0}

In [27]:
sgdc = SGDClassifier(random_state=12345, alpha = 0.0001, penalty = 'l2', power_t = -1.0, class_weight='balanced').fit(features_train, target_train)

In [28]:
print(f1_score(target_test, sgdc.predict(features_test)))

0.8802388059701493


По полученным результатам логистичекая регрессия показал лучший результат по метрике f1_score = 0.878. Будем использовать эту модель машинного обучения

In [29]:
filename= 'sgdc_model.sav'
saved_model=joblib.dump(sgdc,filename)

И сохраним обученный TfidfVectorizer для дальнейшего использования.

In [48]:
test_review = """This is a pale imitation of 'Officer and a Gentleman.' There is NO chemistry between Kutcher and the unknown woman who plays his love interest. The dialog is wooden, the situations hackneyed. It's too long and the climax is anti-climactic(!). I love the USCG, its men and women are fearless and tough. The action scenes are awesome, but this movie doesn't do much for recruiting, I fear. The script is formulaic, but confusing. Kutcher's character is trying to redeem himself for an accident that wasn't his fault? Costner's is raging against the dying of the light, but why? His 'conflict' with his wife is about as deep as a mud puddle. I saw this sneak preview for free and certainly felt I got my money's worth.
"""

In [49]:
result = sgdc.predict(vect.transform([test_review]))
print(result)

[0]


In [51]:
filename2= 'vectorizer.sav'
saved_vect=joblib.dump(vect,filename2)