In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
imdb = pd.read_csv('./data_files/labeledTrainData.tsv', delimiter='\t')
imdb

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...
...,...,...,...
24995,3453_3,0,It seems like more consideration has gone into...
24996,5064_1,0,I don't believe they made this film. Completel...
24997,10905_3,0,"Guy is a loser. Can't get girls, needs to buil..."
24998,10194_3,0,This 30 minute documentary Buñuel made in the ...


In [5]:
imdb.review[20]

'\\Soylent Green\\" is one of the best and most disturbing science fiction movies of the 70\'s and still very persuasive even by today\'s standards. Although flawed and a little dated, the apocalyptic touch and the environmental premise (typical for that time) still feel very unsettling and thought-provoking. This film\'s quality-level surpasses the majority of contemporary SF flicks because of its strong cast and some intense sequences that I personally consider classic. The New York of 2022 is a depressing place to be alive, with over-population, unemployment, an unhealthy climate and the total scarcity of every vital food product. The only form of food available is synthetic and distributed by the Soylent company. Charlton Heston (in a great shape) plays a cop investigating the murder of one of Soylent\'s most eminent executives and he stumbles upon scandals and dark secrets... The script is a little over-sentimental at times and the climax doesn\'t really come as a big surprise, st

In [9]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
stw = stopwords.words('english')

In [10]:
def eng_text_cleaning(text):
    re.sub('[^a-zA-Z]', ' ', text)
    text.lower()
    text = text.split()
    text = [t for t in text if t not in stw]
    text = [stemmer.stem(t) for t in text]
    text = ' '.join(text)
    return text

In [19]:
from tqdm import tqdm

clean_review = []

for r in tqdm(imdb.review):
    clean_review.append(eng_text_cleaning(r))

clean_review

100%|███████████████████████████████████████████████████████████████████████████| 25000/25000 [01:02<00:00, 401.21it/s]


["with stuff go moment mj i'v start listen music, watch odd documentari there, watch the wiz watch moonwalk again. mayb want get certain insight guy thought realli cool eighti mayb make mind whether guilti innocent. moonwalk part biography, part featur film rememb go see cinema origin released. some subtl messag mj' feel toward press also obviou messag drug bad m'kay.<br /><br />visual impress cours michael jackson unless remot like mj anyway go hate find boring. some may call mj egotist consent make movi but mj fan would say made fan true realli nice him.<br /><br />the actual featur film bit final start 20 minut exclud smooth crimin sequenc joe pesci convinc psychopath power drug lord. whi want mj dead bad beyond me. becaus mj overheard plans? nah, joe pesci' charact rant want peopl know suppli drug etc dunno, mayb hate mj' music.<br /><br />lot cool thing like mj turn car robot whole speed demon sequence. also, director must patienc saint came film kiddi bad sequenc usual director h

In [26]:
imdb['clean_review'] = clean_review

In [27]:
imdb.head()

Unnamed: 0,id,sentiment,review,clean_review
0,5814_8,1,With all this stuff going down at the moment w...,with stuff go moment mj i'v start listen music...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...","\the classic war worlds\"" timothi hine enterta..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,the film start manag (nichola bell) give welco...
3,3630_4,0,It must be assumed that those who praised this...,it must assum prais film (\the greatest film o...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,superbl trashi wondrous unpretenti 80' exploit...


In [28]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [29]:
vec = TfidfVectorizer(analyzer='word', min_df=2, ngram_range=(1, 2), max_df=0.9, max_features=10000)

In [32]:
review_vec = vec.fit_transform(imdb.clean_review)

In [36]:
review_df = pd.DataFrame(review_vec.toarray(), columns=vec.get_feature_names())



In [37]:
review_df.head()

Unnamed: 0,00,000,10,10 10,10 br,10 minut,10 star,10 stars,10 year,100,...,zane,zero,zizek,zombi,zombi movi,zombie,zombies,zone,zoom,zorro
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.072122,0.037772,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
x = review_df
y = imdb.sentiment

In [39]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

In [41]:
from sklearn.model_selection import RandomForestClassifier()
rfc.fit(x_train, y_train)
preds = rfc.predict(x_test)



SyntaxError: invalid syntax (Temp/ipykernel_12240/2131327435.py, line 1)

In [None]:
from skleran.metrics import classification_report
print(classification_report(y_test, preds))