In [1]:
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import tqdm
import nltk
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re

nltk.download(['punkt', 'wordnet', 'stopwords'])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
df = pd.read_csv('/gdrive/My Drive/kaggle-imdb/imdb_master.csv',encoding="latin-1", index_col=0)

In [4]:
df.head()

Unnamed: 0,type,review,label,file
0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt


In [5]:
df = df[df['label'] != 'unsup']

stop_words = set(stopwords.words("english"))  
lemmatizer = WordNetLemmatizer()

def preprocessing(sentens):
  description = re.sub("[^a-zA-Z]"," ",sentens)
  description = description.lower()
  description = [lemmatizer.lemmatize(token) for token in description.split(" ")]
  description = [lemmatizer.lemmatize(token, "v") for token in description]
  description = [word for word in description if not word in stop_words]
  description = " ".join(description)
  return description

description_list = []
for sentens in tqdm.tqdm_notebook(df['review'].values):
  description_list.append(preprocessing(sentens))

HBox(children=(IntProgress(value=0, max=50000), HTML(value='')))




In [0]:
df['text'] = description_list
df['label'] = (df['label'] == 'pos').astype(np.int)


df_train = df[df['type'] == 'train'].drop(columns=['type', 'file', 'review'])
df_test = df[df['type'] == 'test'].drop(columns=['type', 'file', 'review'])

In [7]:
df_train.head()

Unnamed: 0,label,text
25000,0,story man ha unnatural feel pig start open sc...
25001,0,airport start brand new luxury plane l...
25002,0,film lack something put finger first charisma...
25003,0,sorry everyone know suppose art film wo...
25004,0,wa little parent take along theater see interi...


In [8]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features=5000)
vectorizer.fit(df_train['text'].values)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=5000, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [0]:
X_train = vectorizer.transform(df_train['text']).toarray()
X_test = vectorizer.transform(df_test['text']).toarray()

y_train = df_train['label'].values
y_test = df_test['label'].values

In [10]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, n_jobs=-1, )

model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [11]:
from sklearn.metrics import accuracy_score, f1_score

y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

print(f"Train acc: {accuracy_score(y_train, y_train_pred)} f1: {f1_score(y_train, y_train_pred)}")
print(f"Test acc: {accuracy_score(y_test, y_test_pred)} f1: {f1_score(y_test, y_test_pred)}")

Train acc: 1.0 f1: 1.0
Test acc: 0.8452 f1: 0.8437752300984983
