# Открываем исходники

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

# Show full cell contents when a DataFrame is displayed (no column-width truncation).
pd.set_option('display.max_colwidth', None)
# Register tqdm with pandas; this provides the `progress_apply` method used in later cells.
tqdm.pandas()

In [2]:
# Read the training labels, one label per line.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale encoding.
with open('train.labels', 'r', encoding='utf-8') as file1:
    labels = file1.read()

# Split on '\n' and drop empty entries (e.g. the one produced by a trailing newline).
labels = [line for line in labels.split('\n') if line]

In [3]:
# Encode the string class labels as integers; `le` is kept so that predictions
# can be mapped back to the original label strings later.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(labels)
binary_labels = le.transform(labels)

In [4]:
# Read the training reviews, one review per line.
# UTF-8 is requested explicitly: the reviews contain non-ASCII characters and
# the default encoding of open() is platform-dependent.
with open('train.texts', 'r', encoding='utf-8') as file2:
    train_texts = file2.read()

# Split on '\n' only; str.splitlines() would also break on other line-boundary
# characters that may occur inside a single review.
train_texts = train_texts.split('\n')
# Drop only the empty string left by a trailing newline. An unconditional [:-1]
# would silently discard the last review when the file has no final newline.
if train_texts[-1] == '':
    train_texts.pop()

In [5]:
# Combine the raw review texts and their encoded labels into a single frame.
# NOTE(review): assumes train.texts and train.labels have the same number of
# lines in the same order — verify against the data files.
imdb_data = pd.DataFrame(data={
    'review': train_texts,
    'labels': binary_labels
})

In [6]:
# Persist the still-unprocessed review/label table to disk (index column omitted).
imdb_data.to_csv('imdb_data.csv', index=False)

In [6]:
# Preview the first three rows.
imdb_data.head(3)

Unnamed: 0,review,labels
0,"If the myth regarding broken mirrors would be accurate, everybody involved in this production would now face approximately 170 years of bad luck, because there are a lot of mirrors falling to little pieces here. If only the script was as shattering as the glass, then ""The Broken"" would have been a brilliant film. Now it's sadly just an overlong, derivative and dull movie with only just a handful of remarkable ideas and memorable sequences. Sean Ellis made a very stylish and elegantly photographed movie, but the story is lackluster and the total absence of logic and explanation is really frustrating. I got into a discussion with a friend regarding the basic concept and ""meaning"" of the film. He thinks Ellis found inspiration in an old legend claiming that spotting your doppelganger is a foreboding of how you're going to die. Interesting theory, but I'm not familiar with this legend and couldn't find anything on the Internet about this, neither. Personally, I just think ""The Broken"" is yet another umpteenth variation on the theme of ""Invasion of the Body Snatchers"" but without the alien interference. ""The Broken"" centers on the American McVey family living in London, and particularly Gina. When a mirror spontaneously breaks during a birthday celebration, this triggers a whole series of mysterious and seemingly supernatural events. Gina spots herself driving by in a car and follows her mirror image to an apartment building. Whilst driving home in a state of mental confusion, she causes a terrible car accident and ends up in the hospital. When dismissed, Gina feels like her whole surrounding is changing. She doesn't recognize her own boyfriend anymore and uncanny fragments of the accident keep flashing before her eyes. Does she suffer from mental traumas invoked by the accident or is there really a supernatural conspiracy happening all around her? 
Writer/director Sean Ellis definitely invokes feelings of curiosity and suspense in his script, but unfortunately he fails to properly elaborate them. ""The Broken"" is a truly atmospheric and stylish effort, but only after just half an hour of film, you come to the painful conclusion it shall just remain a beautiful but empty package. There's a frustratingly high amount of ""fake"" suspense in this film. This means building up tension, through ominous music and eerie camera angels, when absolutely nothing has even happened so far. By the time the actually mysteriousness kicks in, these tricks don't have any scary effect on you anymore. Some of my fellow reviewers around here compare the film and particularly Sean Ellis' style with the repertoires of David Lynch, Stanley Kubrick and even Alfred Hitchcock, but that is way, way WAY too much honor. PS: what is up with that alternate spelling; the one with the Scandinavian ""ø""",0
1,I gave this movie a 10 because it needed to be rewarded for its scary elements and actors AND my god the enging! The thing is I don't want to tell anyone anything about the acting or story because it will ruin the movie. But I will recommend that you go straight to your nearest moviestore right now and rent it! (Don't forget popcorn!),1
2,"After watching the first 20mn of Blanche(sorry I couldn't take more of it), I have now confirmed she does not. <br /><br />Basically, this ""movie"" is an insult to the real french actors participating in this farcical piece of junk. It starts from a concept successfully used in French comedies (""Deux heures moins le quart avant Jesus Christ"", ""La Folie des Grandeurs"",...): a historical movie with anachronic tone / dialogues. This can give brilliant results if supported by brilliant actors and a ""finesse"" of direction avoiding the dreaded ""heavy comedy"" stigma.<br /><br />Unfortunately, the horsey-faced Lou Doillon ruins everything and Blanche, instead of a comedy, just turns into an horror movie. Horror to cinephiles who want to be puzzled and shocked watching fine actors such as Decaune, Zem or Rochefort struggling in the middle of this gaudy burlesque kitchy-prissy farce.",0


# Предобработка текстов

In [None]:
import nltk
from nltk.corpus import stopwords
import re
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources requested by this notebook: the stopword lists,
# the Punkt tokenizer data and WordNet.
# NOTE(review): newer NLTK releases may ask for 'punkt_tab' instead of 'punkt'
# — confirm against the installed version.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# English stopwords from NLTK; consumed by delete_stopwords().
SW = stopwords.words("english")


def primary_processing(text):
    """Normalise a raw review into lowercase, space-separated ASCII words.

    Steps:
      1. Replace HTML line-break tags with a space.
      2. Replace every character that is not an ASCII letter or whitespace
         with a space, then lowercase the result.
      3. Collapse runs of whitespace into single spaces and trim both ends.

    Args:
        text: Raw review string.

    Returns:
        The cleaned string (empty if the input contained no ASCII letters).
    """
    # Match any <br> variant (single or repeated, any case, optional slash).
    # Replacing only the literal '<br /><br />' would leave a stray "br" token
    # for every other form once punctuation is stripped below.
    text = re.sub(r'<\s*br\s*/?\s*>', ' ', text, flags=re.IGNORECASE)
    cleaned_text = re.sub(r'[^a-zA-Z\s]', ' ', text).lower()

    return ' '.join(cleaned_text.split())


def tokenize(text):
    """Split a text string into a list of tokens using NLTK's word_tokenize."""
    return nltk.word_tokenize(text)


def lemmatizer(tokens):
    """Return a new list with every token passed through WordNetLemmatizer.lemmatize.

    A fresh WordNetLemmatizer is created on each call, and lemmatize() is
    invoked with its default arguments.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return list(map(lemmatize, tokens))


def simple_stemmer(tokens):
    """Return a new list with every token reduced to its Porter stem.

    A fresh PorterStemmer is created on each call.
    """
    stem = PorterStemmer().stem
    return list(map(stem, tokens))


def delete_stopwords(tokens, stop_words=None):
    """Filter stopwords out of a token sequence, preserving token order.

    Args:
        tokens: Iterable of string tokens.
        stop_words: Optional collection of words to remove. When omitted, the
            module-level English list ``SW`` is used.

    Returns:
        A new list containing the tokens that are not stopwords.
    """
    if stop_words is None:
        stop_words = SW
    # Membership tests against a list are linear in its length; converting once
    # per call makes every per-token lookup constant-time.
    stop_set = frozenset(stop_words)

    return [token for token in tokens if token not in stop_set]


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mikesu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/mikesu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/mikesu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
# Clean every review with primary_processing; the raw text in 'review' is overwritten.
imdb_data['review'] = imdb_data['review'].apply(primary_processing)

In [9]:
# Tokenise the cleaned reviews; 'review' now holds token lists instead of strings.
imdb_data['review'] = imdb_data['review'].progress_apply(tokenize)

100%|██████████| 15000/15000 [00:15<00:00, 983.76it/s] 


In [10]:
# Lemmatised version of the tokens, stored in a new 'lem' column.
imdb_data['lem']=imdb_data['review'].progress_apply(lemmatizer)

100%|██████████| 15000/15000 [00:32<00:00, 467.21it/s]


In [11]:
# Porter-stemmed version of the tokens, stored in a new 'stem' column.
imdb_data['stem']=imdb_data['review'].progress_apply(simple_stemmer)

100%|██████████| 15000/15000 [01:20<00:00, 185.86it/s]


In [12]:
# Strip stopwords from all three token columns, in the order review, stem, lem.
# NOTE(review): this runs after stemming/lemmatising, so altered forms such as
# "thi", "becaus" or "wa" no longer match the stopword list and remain in the
# 'stem'/'lem' columns — consider filtering stopwords before those steps.
for token_column in ('review', 'stem', 'lem'):
    imdb_data[token_column] = imdb_data[token_column].progress_apply(delete_stopwords)

  0%|          | 0/15000 [00:00<?, ?it/s]

100%|██████████| 15000/15000 [00:09<00:00, 1633.96it/s]
100%|██████████| 15000/15000 [00:09<00:00, 1570.98it/s]
100%|██████████| 15000/15000 [00:08<00:00, 1688.02it/s]


In [13]:
# Inspect one row to compare the token lists in 'review', 'lem' and 'stem'.
imdb_data.head(1)

Unnamed: 0,review,labels,lem,stem
0,"[myth, regarding, broken, mirrors, would, accurate, everybody, involved, production, would, face, approximately, years, bad, luck, lot, mirrors, falling, little, pieces, script, shattering, glass, broken, would, brilliant, film, sadly, overlong, derivative, dull, movie, handful, remarkable, ideas, memorable, sequences, sean, ellis, made, stylish, elegantly, photographed, movie, story, lackluster, total, absence, logic, explanation, really, frustrating, got, discussion, friend, regarding, basic, concept, meaning, film, thinks, ellis, found, inspiration, old, legend, claiming, spotting, doppelganger, foreboding, going, die, interesting, theory, familiar, legend, find, anything, internet, neither, personally, think, broken, yet, another, umpteenth, variation, theme, invasion, body, snatchers, without, alien, interference, broken, centers, american, mcvey, family, living, ...]",0,"[myth, regarding, broken, mirror, would, accurate, everybody, involved, production, would, face, approximately, year, bad, luck, lot, mirror, falling, little, piece, script, wa, shattering, glass, broken, would, brilliant, film, sadly, overlong, derivative, dull, movie, handful, remarkable, idea, memorable, sequence, sean, elli, made, stylish, elegantly, photographed, movie, story, lackluster, total, absence, logic, explanation, really, frustrating, got, discussion, friend, regarding, basic, concept, meaning, film, think, elli, found, inspiration, old, legend, claiming, spotting, doppelganger, foreboding, going, die, interesting, theory, familiar, legend, find, anything, internet, neither, personally, think, broken, yet, another, umpteenth, variation, theme, invasion, body, snatcher, without, alien, interference, broken, center, american, mcvey, family, ...]","[myth, regard, broken, mirror, would, accur, everybodi, involv, thi, product, would, face, approxim, year, bad, luck, becaus, lot, mirror, fall, littl, piec, onli, script, wa, shatter, glass, broken, would, brilliant, film, sadli, 
overlong, deriv, dull, movi, onli, hand, remark, idea, memor, sequenc, sean, elli, made, veri, stylish, elegantli, photograph, movi, stori, lacklust, total, absenc, logic, explan, realli, frustrat, got, discuss, friend, regard, basic, concept, mean, film, think, elli, found, inspir, old, legend, claim, spot, doppelgang, forebod, go, die, interest, theori, familiar, thi, legend, find, anyth, internet, thi, neither, person, think, broken, yet, anoth, umpteenth, variat, theme, invas, bodi, snatcher, without, ...]"


# Подготовка к обучению

In [14]:
# Copy the frame; the following cell replaces the copy's token-list columns
# with joined strings while imdb_data keeps the lists.
imdb_data_cleared = imdb_data.copy()

In [15]:
# Turn each token list back into a single space-separated string so the
# columns can be fed to a text vectorizer.
for text_column in ('review', 'stem', 'lem'):
    imdb_data_cleared[text_column] = imdb_data_cleared[text_column].apply(
        lambda tokens: " ".join(tokens)
    )

In [16]:
# Preview the joined-string version of the first row.
imdb_data_cleared.head(1)

Unnamed: 0,review,labels,lem,stem
0,myth regarding broken mirrors would accurate everybody involved production would face approximately years bad luck lot mirrors falling little pieces script shattering glass broken would brilliant film sadly overlong derivative dull movie handful remarkable ideas memorable sequences sean ellis made stylish elegantly photographed movie story lackluster total absence logic explanation really frustrating got discussion friend regarding basic concept meaning film thinks ellis found inspiration old legend claiming spotting doppelganger foreboding going die interesting theory familiar legend find anything internet neither personally think broken yet another umpteenth variation theme invasion body snatchers without alien interference broken centers american mcvey family living london particularly gina mirror spontaneously breaks birthday celebration triggers whole series mysterious seemingly supernatural events gina spots driving car follows mirror image apartment building whilst driving home state mental confusion causes terrible car accident ends hospital dismissed gina feels like whole surrounding changing recognize boyfriend anymore uncanny fragments accident keep flashing eyes suffer mental traumas invoked accident really supernatural conspiracy happening around writer director sean ellis definitely invokes feelings curiosity suspense script unfortunately fails properly elaborate broken truly atmospheric stylish effort half hour film come painful conclusion shall remain beautiful empty package frustratingly high amount fake suspense film means building tension ominous music eerie camera angels absolutely nothing even happened far time actually mysteriousness kicks tricks scary effect anymore fellow reviewers around compare film particularly sean ellis style repertoires david lynch stanley kubrick even alfred hitchcock way way way much honor ps alternate spelling one scandinavian,0,myth regarding broken mirror would accurate everybody involved production would face 
approximately year bad luck lot mirror falling little piece script wa shattering glass broken would brilliant film sadly overlong derivative dull movie handful remarkable idea memorable sequence sean elli made stylish elegantly photographed movie story lackluster total absence logic explanation really frustrating got discussion friend regarding basic concept meaning film think elli found inspiration old legend claiming spotting doppelganger foreboding going die interesting theory familiar legend find anything internet neither personally think broken yet another umpteenth variation theme invasion body snatcher without alien interference broken center american mcvey family living london particularly gina mirror spontaneously break birthday celebration trigger whole series mysterious seemingly supernatural event gina spot driving car follows mirror image apartment building whilst driving home state mental confusion cause terrible car accident end hospital dismissed gina feel like whole surrounding changing recognize boyfriend anymore uncanny fragment accident keep flashing eye doe suffer mental trauma invoked accident really supernatural conspiracy happening around writer director sean elli definitely invokes feeling curiosity suspense script unfortunately fails properly elaborate broken truly atmospheric stylish effort half hour film come painful conclusion shall remain beautiful empty package frustratingly high amount fake suspense film mean building tension ominous music eerie camera angel absolutely nothing ha even happened far time actually mysteriousness kick trick scary effect anymore fellow reviewer around compare film particularly sean elli style repertoire david lynch stanley kubrick even alfred hitchcock way way way much honor p alternate spelling one scandinavian,myth regard broken mirror would accur everybodi involv thi product would face approxim year bad luck becaus lot mirror fall littl piec onli script wa shatter glass broken would brilliant film 
sadli overlong deriv dull movi onli hand remark idea memor sequenc sean elli made veri stylish elegantli photograph movi stori lacklust total absenc logic explan realli frustrat got discuss friend regard basic concept mean film think elli found inspir old legend claim spot doppelgang forebod go die interest theori familiar thi legend find anyth internet thi neither person think broken yet anoth umpteenth variat theme invas bodi snatcher without alien interfer broken center american mcvey famili live london particularli gina mirror spontan break dure birthday celebr thi trigger whole seri mysteri seemingli supernatur event gina spot drive car follow mirror imag apart build whilst drive home state mental confus caus terribl car accid end hospit dismiss gina feel like whole surround chang recogn boyfriend anymor uncanni fragment accid keep flash befor eye doe suffer mental trauma invok accid realli supernatur conspiraci happen around writer director sean elli definit invok feel curios suspens hi script unfortun fail properli elabor broken truli atmospher stylish effort onli half hour film come pain conclus shall remain beauti empti packag frustratingli high amount fake suspens thi film thi mean build tension omin music eeri camera angel absolut noth ha even happen far time actual mysteri kick trick ani scari effect anymor fellow review around compar film particularli sean elli style repertoir david lynch stanley kubrick even alfr hitchcock way way way much honor ps altern spell one scandinavian


In [17]:
from sklearn.model_selection import train_test_split
# Features: the stopword-filtered 'review' text; target: the encoded label.
X = imdb_data_cleared['review']
y = imdb_data_cleared['labels']

# 70/30 hold-out split, shuffled with a fixed seed so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=42)

### Лучшее решение (LogisticRegression)

In [22]:
"""
Pipeline: TF-IDF features followed by logistic regression.
"""

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# A StandardScaler(with_mean=False) step between the two stages was tried and
# left out because the results were better without it.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('classifier', LogisticRegression(random_state=42)),
])

In [None]:
from sklearn.model_selection import train_test_split

# Compare the same TF-IDF + logistic-regression pipeline on the three text variants.
y = imdb_data_cleared['labels']

for column in ['review', 'lem', 'stem']:

    # Loop-local names on purpose: reusing X / X_train / X_test here would leave
    # them bound to the last variant ('stem') and silently change the input of
    # every later cell that trains on X_train / X_test.
    X_col = imdb_data_cleared[column]
    X_col_train, X_col_test, y_col_train, y_col_test = train_test_split(
        X_col, y, test_size=0.3, shuffle=True, random_state=42
    )

    pipeline.fit(X_col_train, y_col_train)

    col_pred = pipeline.predict(X_col_test)

    accuracy = accuracy_score(y_col_test, col_pred)
    print(f"{column} score is {accuracy}")

review score is 0.8802222222222222
lem score is 0.8768888888888889
stem score is 0.8753333333333333


In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF on the training split only, then apply the fitted vectorizer to
# the test split. Uses whichever X_train / X_test are currently bound in the kernel.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

### Random forest

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

# NOTE: this rebinds `pipeline`, replacing the logistic-regression pipeline
# defined earlier in the notebook.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    # Fixed seed: random forests are randomised, so without it the grid-search
    # scores (and possibly the selected parameters) change from run to run.
    ('clf', RandomForestClassifier(random_state=42))
])


# 3 x 3 x 3 = 27 parameter combinations, each scored with 5-fold cross-validation.
param_grid = {
    'tfidf__max_features': [1000, 2000, 3000],
    'clf__n_estimators': [50, 100, 200],
    'clf__max_depth': [None, 10, 20]
}


grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

grid_search.fit(X_train, y_train)

print("Best Parameters: ", grid_search.best_params_)
print("Best Accuracy: ", grid_search.best_score_)


# XGBoost

In [None]:
import xgboost
from xgboost import XGBClassifier

# XGBoost classifier with library-default hyperparameters, trained on the
# TF-IDF matrix X_train_tfidf.
xgb_clf = XGBClassifier()
xgb_clf.fit(X_train_tfidf, y_train)


# Hold-out predictions on the TF-IDF test matrix.
y_pred = xgb_clf.predict(X_test_tfidf)

# Accuracy on the hold-out split, rounded to two decimals for display.
accuracy = accuracy_score(y_test, y_pred)
print("Your score is ", round(accuracy, 2))

# Результат

In [27]:
# Load the texts to predict and run them through the same steps as the
# training 'review' column: clean -> tokenise -> drop stopwords -> re-join.
test_df = pd.read_csv('texts.csv', sep=',')

test_tokens = (
    test_df['texts']
    .apply(primary_processing)
    .progress_apply(tokenize)
    .progress_apply(delete_stopwords)
)

test_df['texts'] = test_tokens.apply(lambda tokens: ' '.join(tokens))

100%|██████████| 10000/10000 [00:19<00:00, 512.41it/s]
100%|██████████| 10000/10000 [00:12<00:00, 790.95it/s]


In [37]:
# The final model is fitted on the full labelled set (no hold-out), using the
# 'review' text variant.
X_TRAIN= imdb_data_cleared['review']
Y_TRAIN = imdb_data_cleared['labels']

# Preprocessed texts to predict.
X_TEST = test_df['texts']

In [38]:
# Fit a fresh TF-IDF vectorizer on the full training texts and apply the same
# fitted vectorizer to the texts to predict.
ftfidf_vectorizer = TfidfVectorizer()
X_TRAIN_TFIDF = ftfidf_vectorizer.fit_transform(X_TRAIN)
X_TEST_TFIDF = ftfidf_vectorizer.transform(X_TEST)

In [39]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Final classifier: logistic regression with a fixed seed and a generous
# iteration cap (max_iter=10000).
fclf_lr = LogisticRegression(random_state=42, max_iter = 10000)
fclf_lr.fit(X_TRAIN_TFIDF, Y_TRAIN)

# Integer-encoded predictions for the texts in X_TEST.
y_pred = fclf_lr.predict(X_TEST_TFIDF)

In [40]:
# Map the integer predictions back to the original label strings using the
# encoder fitted on the training labels.
results = le.inverse_transform(y_pred)

In [41]:
# Write the predictions: one row per test id with its predicted label.
df = pd.DataFrame({'id': test_df['id'], 'labels': results})
df.to_csv('y_pred.csv', index=False)

In [None]:
# Wiping y_pred.csv is opt-in. Run unconditionally, this cell would overwrite
# the predictions written by the previous cell with an empty file on every
# "Restart & Run All".
RESET_SUBMISSION = False

if RESET_SUBMISSION:
    df = pd.DataFrame({})
    df.to_csv('y_pred.csv', index=False)