In [1]:
import pandas as pd

train_df = pd.read_csv('../data/imdb_train.csv', header=0)

train_df.head(10)

Unnamed: 0,text,label
0,This movie makes me want to throw up every tim...,0
1,Listening to the director's commentary confirm...,0
2,One of the best Tarzan films is also one of it...,1
3,Valentine is now one of my favorite slasher fi...,1
4,No mention if Ann Rivers Siddons adapted the m...,0
5,Several years ago the Navy kept a studied dist...,1
6,This is a masterpiece footage in B/W 35mm film...,1
7,Such a long awaited movie.. But it has disappo...,0
8,When two writers make a screenplay of a horror...,1
9,"Make no mistake, Maureen O'Sullivan is easily ...",1


In [9]:
train_df.columns

Index(['text', 'label'], dtype='object')

In [2]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    25000 non-null  object
 1   label   25000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 390.8+ KB


In [3]:
train_df.isna().sum()

text     0
label    0
dtype: int64

Sequence Of Steps

<ul>
    <li>Preprocess the strings</li>
    <li>split the data</li>
    <li>create the data pipelines</li>
    <li>train the model</li>
    <li>get the metrics</li>
    <li>finally compare the models</li>
</ul>

2. Train a Classifier
You can work with these models:
<ul>
<li>LogisticRegression from here: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html</li>
<li>DecisionTreeClassifier from here: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html</li>
<li>RandomForestClassifier from here: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html</li>
<li>GridSearchCV from here: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html</li>
<li>RandomizedSearchCV from here: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html </li>
</ul>

In [4]:
train_df["label"].value_counts()

1    12500
0    12500
Name: label, dtype: int64

In [5]:
# Install the library and functions
import nltk

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ravikiran.bhonagiri\AppData\Roaming\nltk_data
[nltk_data]     ...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ravikiran.bhonagiri\AppData\Roaming\nltk_data
[nltk_data]     ...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
import nltk, re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

# Let's get a list of stop words from the NLTK library
stop = stopwords.words('english')

# These words are important for our problem. We don't want to remove them.
excluding = ['against', 'not', 'don', "don't",'ain', 'aren', "aren't", 'couldn', "couldn't",
             'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 
             'haven', "haven't", 'isn', "isn't", 'mightn', "mightn't", 'mustn', "mustn't",
             'needn', "needn't",'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', 
             "weren't", 'won', "won't", 'wouldn', "wouldn't"]

# New stop word list
stop_words = [word for word in stop if word not in excluding]

snow = SnowballStemmer('english')

def process_text(texts): 
    final_text_list=[]
    for sent in texts:
        
        # Check if the sentence is a missing value
        if isinstance(sent, str) == False:
            sent = ""
            
        filtered_sentence=[]
        
        sent = sent.lower() # Lowercase 
        sent = sent.strip() # Remove leading/trailing whitespace
        sent = re.sub('\s+', ' ', sent) # Remove extra space and tabs
        sent = re.compile('<.*?>').sub('', sent) # Remove HTML tags/markups:
        
        for w in word_tokenize(sent):
            # We are applying some custom filtering here, feel free to try different things
            # Check if it is not numeric and its length>2 and not in stop words
            if(not w.isnumeric()) and (len(w)>2) and (w not in stop_words):  
                # Stem and add to filtered list
                filtered_sentence.append(snow.stem(w))
        final_string = " ".join(filtered_sentence) #final string of cleaned words
 
        final_text_list.append(final_string)
        
    return final_text_list

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(train_df[["text"]],
                                                  train_df["label"],
                                                  test_size=0.10,
                                                  shuffle=True,
                                                  random_state=360
                                                 )

In [13]:
X_train.columns

Index(['text'], dtype='object')

In [14]:
print("Processing the text fields")
X_train["text"] = process_text(X_train["text"].tolist())
X_val["text"] = process_text(X_val["text"].tolist())

Processing the text fields


In [16]:

# Grab model features/inputs and target/output
numerical_features = []

text_features = ['text']

model_features = numerical_features + text_features
model_target = 'label'

In [17]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

### COLUMN_TRANSFORMER ###
##########################

'''

# Preprocess the numerical features
numerical_processor = Pipeline([
    ('num_scaler', MinMaxScaler())
])

# Preprocess 1st text feature
text_processor_0 = Pipeline([
    ('text_vect_0', CountVectorizer(binary=True, max_features=50))
])

# Preprocess 2nd text feature (larger vocabulary)
text_precessor_1 = Pipeline([
    ('text_vect_1', CountVectorizer(binary=True, max_features=150))
])

# Combine all data preprocessors from above (add more, if you choose to define more!)
# For each processor/step specify: a name, the actual process, and finally the features to be processed
data_preprocessor = ColumnTransformer([
    ('numerical_pre', numerical_processor, numerical_features),
    ('text_pre_0', text_processor_0, text_features[0]),
    ('text_pre_1', text_precessor_1, text_features[1])
]) 


'''


text_processor_label = Pipeline([
    ('text_vect', CountVectorizer(binary=True, max_features=50))
])

data_preprocessor = ColumnTransformer([
    ('text_pre_l', text_processor_label, text_features[0])
])



In [18]:
pipeline = Pipeline([
    ('data_preprocessing', data_preprocessor),
    ('logistic_regression', LogisticRegression(penalty = 'l2',
                              C = 0.1))
                    ])

from sklearn import set_config
set_config(display='diagram')
pipeline

pipeline.fit(X_train, y_train.values)

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Use the fitted pipeline to make predictions on the validation dataset
val_predictions = pipeline.predict(X_val)
print(confusion_matrix(y_val.values, val_predictions))
print(classification_report(y_val.values, val_predictions))
print("Accuracy (validation):", accuracy_score(y_val.values, val_predictions))

[[876 394]
 [344 886]]
              precision    recall  f1-score   support

           0       0.72      0.69      0.70      1270
           1       0.69      0.72      0.71      1230

    accuracy                           0.70      2500
   macro avg       0.71      0.71      0.70      2500
weighted avg       0.71      0.70      0.70      2500

Accuracy (validation): 0.7048


In [19]:
from sklearn.tree import DecisionTreeClassifier

pipeline = Pipeline([
    ('data_preprocessing', data_preprocessor),
    ('decision_tree_classifier', DecisionTreeClassifier(max_depth = 10,
                                             min_samples_leaf = 15))
                    ])

from sklearn import set_config
set_config(display='diagram')
pipeline

pipeline.fit(X_train, y_train.values)

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Use the fitted pipeline to make predictions on the validation dataset
val_predictions = pipeline.predict(X_val)
print(confusion_matrix(y_val.values, val_predictions))
print(classification_report(y_val.values, val_predictions))
print("Accuracy (validation):", accuracy_score(y_val.values, val_predictions))

[[777 493]
 [339 891]]
              precision    recall  f1-score   support

           0       0.70      0.61      0.65      1270
           1       0.64      0.72      0.68      1230

    accuracy                           0.67      2500
   macro avg       0.67      0.67      0.67      2500
weighted avg       0.67      0.67      0.67      2500

Accuracy (validation): 0.6672


In [None]:
from sklearn.model_selection import GridSearchCV

### PIPELINE GRID_SEARCH ###
############################

# Parameter grid for GridSearch
param_grid={'decision_tree__max_depth': [15, 25, 35], #, 45, 55, 75], 
            'decision_tree__min_samples_leaf': [5, 10, 15], #, 15, 30],
           }

grid_search = GridSearchCV(pipeline, # Base model
                           param_grid, # Parameters to try
                           cv = 5, # Apply 5-fold cross validation
                           verbose = 1, # Print summary
                           n_jobs = -1 # Use all available processors
                          )

# Fit the GridSearch to our training data
grid_search.fit(X_train, y_train)

print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

In [20]:
from sklearn.ensemble import RandomForestClassifier


pipeline = Pipeline([
    ('data_preprocessing', data_preprocessor),
    ('random_forest_classifier', RandomForestClassifier(n_estimators=150,
                                             max_depth = 10,
                                             min_samples_leaf = 15))
                    ])

from sklearn import set_config
set_config(display='diagram')
pipeline

pipeline.fit(X_train, y_train.values)

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Use the fitted pipeline to make predictions on the validation dataset
val_predictions = pipeline.predict(X_val)
print(confusion_matrix(y_val.values, val_predictions))
print(classification_report(y_val.values, val_predictions))
print("Accuracy (validation):", accuracy_score(y_val.values, val_predictions))

[[866 404]
 [327 903]]
              precision    recall  f1-score   support

           0       0.73      0.68      0.70      1270
           1       0.69      0.73      0.71      1230

    accuracy                           0.71      2500
   macro avg       0.71      0.71      0.71      2500
weighted avg       0.71      0.71      0.71      2500

Accuracy (validation): 0.7076


In [None]:
import pandas as pd

test_df = pd.read_csv('../data/imdb_test.csv', header=0)
test_df.head(10)

In [None]:
test_df.isna().sum()

In [None]:
test_df.info()