### https://towardsdatascience.com/a-practitioners-guide-to-natural-language-processing-part-i-processing-understanding-text-9f4abfd13e72

In [1]:
# Notebook plotting setup: render matplotlib figures inline in the notebook.
%matplotlib inline   
# Ask the inline backend to emit figures in SVG format.
%config InlineBackend.figure_format = 'svg'
import matplotlib.pyplot as plt
# Shared size for large figures — presumably (width, height) in inches for
# matplotlib's `figsize`; not used in the visible cells, TODO confirm.
LARGE_FIG_SIZE=(20, 15)

import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing  import  OrdinalEncoder

In [2]:
# Load the raw IMDB review dump that sits next to the notebook. The file is
# decoded as latin-1 rather than pandas' default encoding.
df = pd.read_csv(
    './imdb_master.csv',
    encoding='latin-1',
)
df

Unnamed: 0.1,Unnamed: 0,type,review,label,file
0,0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt
...,...,...,...,...,...
99995,99995,train,"Delightfully awful! Made by David Giancola, a ...",unsup,9998_0.txt
99996,99996,train,"Watching Time Chasers, it obvious that it was ...",unsup,9999_0.txt
99997,99997,train,At the beginning we can see members of Troma t...,unsup,999_0.txt
99998,99998,train,"The movie was incredible, ever since I saw it ...",unsup,99_0.txt


In [3]:
# Discard the two columns that are not needed further on: the 'Unnamed: 0'
# column read from the CSV and the source 'file' name.
unused_columns = ['Unnamed: 0', 'file']
df = df.drop(columns=unused_columns)
df

Unnamed: 0,type,review,label
0,test,Once again Mr. Costner has dragged out a movie...,neg
1,test,This is an example of why the majority of acti...,neg
2,test,"First of all I hate those moronic rappers, who...",neg
3,test,Not even the Beatles could write songs everyon...,neg
4,test,Brass pictures (movies is not a fitting word f...,neg
...,...,...,...
99995,train,"Delightfully awful! Made by David Giancola, a ...",unsup
99996,train,"Watching Time Chasers, it obvious that it was ...",unsup
99997,train,At the beginning we can see members of Troma t...,unsup
99998,train,"The movie was incredible, ever since I saw it ...",unsup


In [4]:
# List the distinct values present in the 'label' column.
df['label'].unique()

array(['neg', 'pos', 'unsup'], dtype=object)

In [12]:
# Keep only the rows whose label is not 'unsup' (presumably the unlabelled
# reviews — TODO confirm against the dataset description).
# .copy() makes the result an independent frame: the 'label' column is
# reassigned in a later cell, and writing into a boolean-filtered slice is what
# triggers pandas' SettingWithCopyWarning.
df = df[df.label != 'unsup'].copy()
df

Unnamed: 0,type,review,label
0,test,Once again Mr. Costner has dragged out a movie...,0
1,test,This is an example of why the majority of acti...,0
2,test,"First of all I hate those moronic rappers, who...",0
3,test,Not even the Beatles could write songs everyon...,0
4,test,Brass pictures (movies is not a fitting word f...,0
...,...,...,...
49995,train,"Seeing as the vote average was pretty low, and...",1
49996,train,"The plot had some wretched, unbelievable twist...",1
49997,train,I am amazed at how this movie(and most others ...,1
49998,train,A Christmas Together actually came before my t...,1


In [None]:
# Distinct values of the 'label' column at this point in the notebook.
df['label'].unique()

In [6]:
## Map movie review sentiment: positive = 1, negative = 0
label_codes = {'pos': 1, 'neg': 0}
# Encode only while the column still holds the raw string labels. Re-running
# this cell on already-encoded 0/1 values would otherwise turn every label
# into NaN, because neither 0 nor 1 is a key of the mapping.
if not df['label'].isin(list(label_codes.values())).all():
    # assign() returns a new frame instead of writing into `df` in place, so no
    # SettingWithCopyWarning is raised when `df` is a filtered slice.
    df = df.assign(label=df['label'].map(label_codes))
df.label.unique()

array([0, 1])

In [7]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,type,review,label
0,test,Once again Mr. Costner has dragged out a movie...,0
1,test,This is an example of why the majority of acti...,0
2,test,"First of all I hate those moronic rappers, who...",0
3,test,Not even the Beatles could write songs everyon...,0
4,test,Brass pictures (movies is not a fitting word f...,0


In [8]:
# Distinct values of the 'type' column, which the later cells use to separate
# the "train" rows from the "test" rows.
df['type'].unique()

array(['test', 'train'], dtype=object)

Unnamed: 0,level_0,index,type,review,label
0,0,25000,train,Story of a man who has unnatural feelings for ...,0
1,1,25001,train,Airport '77 starts as a brand new luxury 747 p...,0
2,2,25002,train,This film lacked something I couldn't put my f...,0
3,3,25003,train,"Sorry everyone,,, I know this is supposed to b...",0
4,4,25004,train,When I was little my parents took me along to ...,0
...,...,...,...,...,...
24995,24995,49995,train,"Seeing as the vote average was pretty low, and...",1
24996,24996,49996,train,"The plot had some wretched, unbelievable twist...",1
24997,24997,49997,train,I am amazed at how this movie(and most others ...,1
24998,24998,49998,train,A Christmas Together actually came before my t...,1


In [9]:
# Training split: every row whose 'type' is "train", without the now-constant
# 'type' column and with a fresh 0..n-1 index.
# Built as a non-mutating chain so that re-running the cell gives the same
# result (an in-place reset_index run twice adds a stray 'level_0' column) and
# so that nothing is written into a slice of `df`.
X_train = (
    df.query('type == "train"')
    .drop(columns=['type'])
    .reset_index(drop=True)
)
# The target is the 'label' column. The previous `X_train.index` was only the
# row numbers 0..n-1 and carried no class information.
# NOTE(review): X_train still contains 'label' (kept for backward
# compatibility); select only 'review' when building model features.
Y_train = X_train['label']
print(X_train.shape, Y_train.shape)
X_train

(25000, 2) (25000,)


Unnamed: 0,review,label
0,Story of a man who has unnatural feelings for ...,0
1,Airport '77 starts as a brand new luxury 747 p...,0
2,This film lacked something I couldn't put my f...,0
3,"Sorry everyone,,, I know this is supposed to b...",0
4,When I was little my parents took me along to ...,0
...,...,...
24995,"Seeing as the vote average was pretty low, and...",1
24996,"The plot had some wretched, unbelievable twist...",1
24997,I am amazed at how this movie(and most others ...,1
24998,A Christmas Together actually came before my t...,1


In [11]:
# Test split: every row whose 'type' is "test", without the now-constant
# 'type' column and with a fresh 0..n-1 index.
# Built as a non-mutating chain so that re-running the cell gives the same
# result (an in-place reset_index run twice adds a stray 'level_0' column) and
# so that nothing is written into a slice of `df`.
X_test = (
    df.query('type == "test"')
    .drop(columns=['type'])
    .reset_index(drop=True)
)
# The target is the 'label' column. The previous `X_test.index` was only the
# row numbers 0..n-1 and carried no class information.
# NOTE(review): X_test still contains 'label' (kept for backward
# compatibility); select only 'review' when building model features.
Y_test = X_test['label']
print(X_test.shape, Y_test.shape)
X_test

(25000, 2) (25000,)


Unnamed: 0,review,label
0,Once again Mr. Costner has dragged out a movie...,0
1,This is an example of why the majority of acti...,0
2,"First of all I hate those moronic rappers, who...",0
3,Not even the Beatles could write songs everyon...,0
4,Brass pictures (movies is not a fitting word f...,0
...,...,...
24995,I was extraordinarily impressed by this film. ...,1
24996,"Although I'm not a golf fan, I attended a snea...",1
24997,"From the start of ""The Edge Of Love"", the view...",1
24998,"This movie, with all its complexity and subtle...",1


#### Distribution 50/50 => Is this acceptable? 

### Word Treatment

In [15]:
# TF-IDF vectorizer over unigrams and bigrams (ngram_range=(1, 2)).
# min_df=5 drops terms that occur in fewer than 5 documents; max_df=0.5 drops
# terms that occur in more than half of the documents.
# TfidfVectorizer is imported in the notebook's top import cell.
tfidf = TfidfVectorizer(min_df=5, max_df=0.5, ngram_range=(1, 2))
tfidf

TfidfVectorizer(max_df=0.5, min_df=5, ngram_range=(1, 2))