### SOP EVALUATION USING ML ALGORITHMS

##### Reading the dataset.

In [6]:
import pandas as pd
df = pd.read_excel('SOPs Dataset.xlsx')

In [7]:
df.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,,,,,
1,Sno,,SOP,,Label
2,,,,,
3,1,,To the Admissions Committee of University of B...,,
4,2,,Admission Committee of the University of Toron...,,


##### Data Cleaning

In [8]:
column_labels = ['Column1', 'Column2', 'SOP','Column4','Label']
df.columns = column_labels
display(df)

Unnamed: 0,Column1,Column2,SOP,Column4,Label
0,,,,,
1,Sno,,SOP,,Label
2,,,,,
3,1,,To the Admissions Committee of University of B...,,
4,2,,Admission Committee of the University of Toron...,,
...,...,...,...,...,...
816,796,,"Dear Admissions Committee,\n\nI am Nisha Gupta...",,1
817,797,,"Dear Admissions Committee,\n\nI am Anirudh Kap...",,1
818,798,,"Dear Admissions Committee,\n\nI am Priyanka Pa...",,1
819,799,,"Dear Admissions Committee,\n\nI am Arjun Mehta...",,1


In [9]:
# Dropping unwanted columns and displaying the dataset
columns_to_drop = ['Column1', 'Column2', 'Column4']
df.drop(columns=columns_to_drop, inplace=True)
display(df)

Unnamed: 0,SOP,Label
0,,
1,SOP,Label
2,,
3,To the Admissions Committee of University of B...,
4,Admission Committee of the University of Toron...,
...,...,...
816,"Dear Admissions Committee,\n\nI am Nisha Gupta...",1
817,"Dear Admissions Committee,\n\nI am Anirudh Kap...",1
818,"Dear Admissions Committee,\n\nI am Priyanka Pa...",1
819,"Dear Admissions Committee,\n\nI am Arjun Mehta...",1


In [10]:
# Drop the first 3 since it has no values
df = df.drop(df.index[:3])
df.head(5)

Unnamed: 0,SOP,Label
3,To the Admissions Committee of University of B...,
4,Admission Committee of the University of Toron...,
5,Dear Admissions Committee of McGill University...,
6,To the Esteemed Admissions Panel of McMaster U...,
7,Dear Admissions Committee of the University of...,


In [11]:
# Reset the index to 0
df = df.reset_index(drop=True)
df.head(5)

Unnamed: 0,SOP,Label
0,To the Admissions Committee of University of B...,
1,Admission Committee of the University of Toron...,
2,Dear Admissions Committee of McGill University...,
3,To the Esteemed Admissions Panel of McMaster U...,
4,Dear Admissions Committee of the University of...,


In [12]:
# Replace all 1's with 0 in the 'Label' column
df['Label'] = df['Label'].replace(1, 0)
df.head(10)

Unnamed: 0,SOP,Label
0,To the Admissions Committee of University of B...,
1,Admission Committee of the University of Toron...,
2,Dear Admissions Committee of McGill University...,
3,To the Esteemed Admissions Panel of McMaster U...,
4,Dear Admissions Committee of the University of...,
5,To the Respected Admissions Board of the Unive...,
6,Dear Admissions Committee at the University of...,
7,To the Esteemed Admissions Panel of McMaster U...,
8,"Dear Admissions Board of Queen's University,\n...",
9,To the Admission Committee of the University o...,


In [13]:
# Replace all NaN values with 1 in the 'Label' column
df['Label'].fillna(1, inplace=True)
df.head(100)

Unnamed: 0,SOP,Label
0,To the Admissions Committee of University of B...,1.0
1,Admission Committee of the University of Toron...,1.0
2,Dear Admissions Committee of McGill University...,1.0
3,To the Esteemed Admissions Panel of McMaster U...,1.0
4,Dear Admissions Committee of the University of...,1.0
...,...,...
95,"To the Visa Officer,\nCanada High Commission, ...",0.0
96,"To the Visa Officer,\nCanada High Commission, ...",0.0
97,"To the Visa Officer,\nCanada High Commission, ...",0.0
98,"To the Visa Officer,\nCanada High Commission, ...",0.0


In [14]:
# Drop rows with null values in the 'SOP' column
df = df.dropna(subset=['SOP'])
df = df.reset_index(drop=True)

In [15]:
df.shape

(804, 2)

In [16]:
# Checking for missing values
df.isna()

Unnamed: 0,SOP,Label
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
799,False,False
800,False,False
801,False,False
802,False,False


In [17]:
# Check for null values and printing columns with null values and their respective counts
null_values = df.isnull().sum()
for column, count in null_values.items():
    if count > 0:
        print(f'Column: {column}, Null Count: {count}')

In [18]:
# Checking if the dataset is biased or not
df.Label.value_counts()

1.0     630
0.0     173
11.0      1
Name: Label, dtype: int64

In [19]:
# Replacing one label 11.0 with 1.0
df['Label'] = df['Label'].replace([11.0],1.0)
df['Label'].value_counts()

1.0    631
0.0    173
Name: Label, dtype: int64

In [20]:
df.Label.value_counts()

1.0    631
0.0    173
Name: Label, dtype: int64

Here we can understand that the count of approved SOP's are so high in number when compared to those which are rejected. 
Which creates a bias problem in the dataset.
So we need to perform augmentation to balance the dataset.

##### Augmentation

In [21]:
# Function to replace rejected SOP words with its synonyms and create augmented SOP

import nltk
from nltk.corpus import wordnet
import random

nltk.download('wordnet')

def get_synonyms(word):
    synonyms = []
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.append(lemma.name())
    return synonyms

def synonym_replacement(text, n=1):
    words = text.split()
    augmented_texts = []
    for _ in range(n):
        augmented_words = []
        for word in words:
            synonyms = get_synonyms(word)
            if synonyms:
                synonym = random.choice(synonyms)
                augmented_words.append(synonym)
            else:
                augmented_words.append(word)
        augmented_text = ' '.join(augmented_words)
        augmented_texts.append(augmented_text)
    return str(augmented_texts)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\roysi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [22]:
#1st iteration for augmentation SOP's with label 0
rejected_sop = df[df['Label'] == 0]

aug_texts = []
for text in rejected_sop["SOP"]:
    aug_text = synonym_replacement(text)
    aug_texts.append(aug_text)

rejected_sop["SOP"] = aug_texts
rejected_sop.head()

df = pd.concat([df, rejected_sop], ignore_index=True)

df.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rejected_sop["SOP"] = aug_texts


(977, 2)

In [23]:
with pd.option_context('display.max_colwidth', None):
    print(rejected_sop.head(1))

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [24]:
# Resetting the index
df = df.reset_index(level=0)
df['index'] = df.index

In [25]:
#2nd iteration for augmentation the SOP's with label 0
rejected_sop_1 = df[df['Label'] == 0] 
rejected_sop_1.SOP.count()

texts = []
for text in rejected_sop_1["SOP"]:
    aug_text = synonym_replacement(str(text))
    texts.append(aug_text)

rejected_sop_1["SOP"] = texts
rejected_sop.head()

df = pd.concat([df, rejected_sop], ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rejected_sop_1["SOP"] = texts


In [26]:
# Appending the augmented SOP's to the dataframe
new_sop =[]
for row in df.SOP:
    if type(row) == str:
        new_sop.append(row)
    else:
        concatenated_series = pd.concat(row)
        new_sop.append(', '.join(concatenated_series))

In [27]:
# Total SOP Count after augmentation
df['Label'].value_counts()

1.0    631
0.0    519
Name: Label, dtype: int64

##### NLP Preprocessing

In [28]:
import string

In [29]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [30]:
# Removing punctuation's
def remove_punctuations(text):
    punctuations = string.punctuation
    return text.translate(str.maketrans('','',punctuations))
df['SOP'] = df['SOP'].astype(str)

In [31]:
df['Clean_sop'] = df['SOP'].apply(lambda x: remove_punctuations(x))

In [32]:
df.Clean_sop.tail()

1145    heartfelt entrancefee Committee I be Nisha Gup...
1146    dear access Committee ace constitute Anirudh K...
1147    devout admission Committee I be Priyanka Patel...
1148    honey admission Committee single be Arjun Meht...
1149    dearly admission Committee I be Meera Sharma A...
Name: Clean_sop, dtype: object

In [33]:
from nltk.corpus import stopwords

In [34]:
STOPWORDS = set(stopwords.words("english"))

In [35]:
print(STOPWORDS)

{"you're", 'she', 'than', 'am', 'over', 'so', 'up', "should've", 'why', 'our', 'below', 'no', 'with', 'each', 's', 'wasn', 'who', 'didn', 'y', "isn't", 'most', 'out', 'myself', 'can', 'should', 'the', 'doing', 'an', 'just', 'being', 't', 'themselves', 'any', 'some', "mightn't", "that'll", 'him', 'because', 'their', 'his', 'does', 'hasn', "won't", 'a', 'to', 'shan', 'against', 'had', 'from', 'weren', "she's", 'into', 'theirs', 'where', 'after', 'very', 'too', 'about', 'won', 'you', 'how', 'under', 'be', 'couldn', 'hadn', 'ours', 'are', 'mightn', 'aren', 'whom', "shan't", 'it', 'there', 'm', 'll', 'when', 'd', "couldn't", "don't", 'but', "doesn't", "you've", 'this', "needn't", "weren't", 'been', 'my', 'itself', 'yourselves', 'as', 'her', 'before', 'that', 'these', 'once', 'for', 'same', 'those', "hasn't", "shouldn't", "wasn't", 'me', 'if', 'which', 'further', 'don', 'on', 'nor', "hadn't", 'mustn', 'i', 'own', 've', 'having', 'more', 'shouldn', 'isn', "aren't", 'is', 'few', 'what', 'hers'

In [36]:
# Removing stopwords
def remove_stopwords(text):
    return " ".join(word for word in text.split() if word not in STOPWORDS)

In [37]:
df["Clean_sop"] = df["Clean_sop"].apply(lambda x: remove_stopwords(x))

In [38]:
df.tail()

Unnamed: 0,index,SOP,Label,Clean_sop
1145,,"[""heartfelt entrance_fee Committee, I be Nisha...",0.0,heartfelt entrancefee Committee I Nisha Gupta ...
1146,,"[""dear access Committee, ace constitute Anirud...",0.0,dear access Committee ace constitute Anirudh K...
1147,,"[""devout admission Committee, I be Priyanka Pa...",0.0,devout admission Committee I Priyanka Patel ad...
1148,,"[""honey admission Committee, single be Arjun M...",0.0,honey admission Committee single Arjun Mehta A...
1149,,"[""dearly admission Committee, I be Meera Sharm...",0.0,dearly admission Committee I Meera Sharma A gi...


In [39]:
# Lower casing the SOP
df['Clean_sop'] = df['Clean_sop'].str.lower()

In [40]:
df.tail()

Unnamed: 0,index,SOP,Label,Clean_sop
1145,,"[""heartfelt entrance_fee Committee, I be Nisha...",0.0,heartfelt entrancefee committee i nisha gupta ...
1146,,"[""dear access Committee, ace constitute Anirud...",0.0,dear access committee ace constitute anirudh k...
1147,,"[""devout admission Committee, I be Priyanka Pa...",0.0,devout admission committee i priyanka patel ad...
1148,,"[""honey admission Committee, single be Arjun M...",0.0,honey admission committee single arjun mehta a...
1149,,"[""dearly admission Committee, I be Meera Sharm...",0.0,dearly admission committee i meera sharma a gi...


In [41]:
# Tokenizing the SOP text
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')
df['tokenized_text'] = df['Clean_sop'].apply(lambda x: word_tokenize(x))
print(df['tokenized_text'])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\roysi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


0       [to, admissions, committee, university, britis...
1       [admission, committee, university, toronto, gr...
2       [dear, admissions, committee, mcgill, universi...
3       [to, esteemed, admissions, panel, mcmaster, un...
4       [dear, admissions, committee, university, toro...
                              ...                        
1145    [heartfelt, entrancefee, committee, i, nisha, ...
1146    [dear, access, committee, ace, constitute, ani...
1147    [devout, admission, committee, i, priyanka, pa...
1148    [honey, admission, committee, single, arjun, m...
1149    [dearly, admission, committee, i, meera, sharm...
Name: tokenized_text, Length: 1150, dtype: object


In [42]:
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

In [43]:
# Lemmatizing the tokenized word with appropriate Parts Of Speech tagging 
lemmatizer = WordNetLemmatizer()

# Function to perform POS tagging and lemmatization
def pos_mapping_lemmatization(text):
    pos_tags = pos_tag(text)
    
    lemmatized_words = []
    for word, tag in pos_tags:
        if tag.startswith('N'):
            pos = 'n'
        elif tag.startswith('V'):
            pos = 'v'
        elif tag.startswith('R'):
            pos = 'r'
        else:
            pos = 'n'
        
        lemma = lemmatizer.lemmatize(word, pos)
        lemmatized_words.append(lemma)
    
    lemmatized_text = ' '.join(lemmatized_words)
    return lemmatized_text

In [44]:
df['lemmatized_text'] = df['tokenized_text'].apply(lambda x: pos_mapping_lemmatization(x))
df.head()

Unnamed: 0,index,SOP,Label,Clean_sop,tokenized_text,lemmatized_text
0,0.0,To the Admissions Committee of University of B...,1.0,to admissions committee university british col...,"[to, admissions, committee, university, britis...",to admission committee university british colu...
1,1.0,Admission Committee of the University of Toron...,1.0,admission committee university toronto greetin...,"[admission, committee, university, toronto, gr...",admission committee university toronto greetin...
2,2.0,Dear Admissions Committee of McGill University...,1.0,dear admissions committee mcgill university my...,"[dear, admissions, committee, mcgill, universi...",dear admission committee mcgill university my ...
3,3.0,To the Esteemed Admissions Panel of McMaster U...,1.0,to esteemed admissions panel mcmaster universi...,"[to, esteemed, admissions, panel, mcmaster, un...",to esteem admission panel mcmaster university ...
4,4.0,Dear Admissions Committee of the University of...,1.0,dear admissions committee university toronto m...,"[dear, admissions, committee, university, toro...",dear admission committee university toronto my...


#### MODEL TRAINING

In [45]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [46]:
# Train-Validation Split
X_train, X_val, y_train, y_val = train_test_split(df['lemmatized_text'], df['Label'], test_size=0.2, random_state=42)

In [47]:
# Train and Validation set distribution of the dataset
lenSOP = df['lemmatized_text'].count()
print(f'Total Dataset: {lenSOP}')
print(f'Training Set: {len(X_train)}')
print(f'Validation Set: {len(X_val)}')

Total Dataset: 1150
Training Set: 920
Validation Set: 230


RandomForestClassifier using CountVectorizer

In [48]:
# Using Count Vectorizer to convert text to vectors
vectorizer_lda = CountVectorizer(min_df=2)  # Adjust min_df as needed
vectorizer_lda.fit(X_train)
X_train_cv = vectorizer_lda.transform(X_train)
X_val_cv = vectorizer_lda.transform(X_val)
print(f'Shape of train data: {X_train_cv.shape}')

Shape of train data: (920, 4598)


In [49]:
print(vectorizer_lda.get_feature_names_out()[100:120])

['acquire' 'acquirement' 'acquisition' 'across' 'acrosstheboard' 'act'
 'action' 'actionable' 'active' 'actively' 'activist' 'acton' 'actuarial'
 'actuary' 'actuate' 'actuation' 'acumen' 'acute' 'adani' 'adaptability']


In [59]:
# Model Training and Prediction for Count Vectorizer using random forest classifier
clf_cv = RandomForestClassifier(max_depth=2, max_leaf_nodes=3, n_estimators=20, class_weight={0:100, 1:60})
clf_cv.fit(X_train_cv, y_train)
y_pred_cv_train = clf_cv.predict(X_train_cv)
y_pred_cv_val = clf_cv.predict(X_val_cv)

# Model Evaluation for RFC
metrics = {'Training Set':
           {'Accuracy':accuracy_score(y_train, y_pred_cv_train),
            'Precision':precision_score(y_train, y_pred_cv_train),
            'Recall':recall_score(y_train, y_pred_cv_train),
            'F1_score':f1_score(y_train, y_pred_cv_train)},
           
           'Validation Set':
           {'Accuracy':accuracy_score(y_val, y_pred_cv_val),
            'Precision':precision_score(y_val, y_pred_cv_val),
            'Recall':recall_score(y_val, y_pred_cv_val),
            'F1_score':f1_score(y_val, y_pred_cv_val)}
            }

metricDf = pd.DataFrame.from_dict(metrics)
print('RandomForestClassifier using CountVectorizer')
display(metricDf)


RandomForestClassifier using CountVectorizer


Unnamed: 0,Training Set,Validation Set
Accuracy,0.934783,0.913043
Precision,0.944223,0.92
Recall,0.936759,0.92
F1_score,0.940476,0.92


RandomForestClassifier using TfidfVectorizer

In [51]:
# Using TF-IDF vectorizer for converting text to vectors
vectorizer_lsa = TfidfVectorizer()
vectorizer_lsa.fit(X_train)
X_train_tfidf = vectorizer_lsa.transform(X_train)
X_val_tfidf = vectorizer_lsa.transform(X_val)
print(f'Shape of train data: {X_train_tfidf.shape}')

Shape of train data: (920, 6078)


In [52]:
print(vectorizer_lsa.get_feature_names_out()[100:120])

['acceptation' 'accepted' 'access' 'accessibility' 'accessible'
 'accession' 'acclaim' 'accolade' 'accommodate' 'accompaniment'
 'accompany' 'accomplish' 'accomplished' 'accomplishment' 'accord'
 'accost' 'account' 'accounting' 'accredit' 'ace']


In [60]:
# Model Training and Prediction for TF-IDF using random forest classifier
clf_tfidf = RandomForestClassifier(max_depth=2, max_leaf_nodes=3, n_estimators=20, class_weight={0:100, 1:60})
clf_tfidf.fit(X_train_tfidf, y_train)
y_pred_tfidf_train = clf_tfidf.predict(X_train_tfidf)
y_pred_tfidf_val = clf_tfidf.predict(X_val_tfidf)

# Model Evaluation for TF-IDF
metrics = {'Training Set':
           {'Accuracy':accuracy_score(y_train, y_pred_tfidf_train),
            'Precision':precision_score(y_train, y_pred_tfidf_train),
            'Recall':recall_score(y_train, y_pred_tfidf_train),
            'F1_score':f1_score(y_train, y_pred_tfidf_train)},
           
           'Validation Set':
           {'Accuracy':accuracy_score(y_val, y_pred_tfidf_val),
            'Precision':precision_score(y_val, y_pred_tfidf_val),
            'Recall':recall_score(y_val, y_pred_tfidf_val),
            'F1_score':f1_score(y_val, y_pred_tfidf_val)}
            }

metricDf = pd.DataFrame.from_dict(metrics)
print('RandomForestClassifier using TfidfVectorizer and TruncatedSVD')
display(metricDf)

RandomForestClassifier using TfidfVectorizer and TruncatedSVD


Unnamed: 0,Training Set,Validation Set
Accuracy,0.865217,0.873913
Precision,0.965854,0.961538
Recall,0.782609,0.8
F1_score,0.864629,0.873362


##### Hyperparameter Tuning

In [54]:
# Using Hyperparameter tuning to find best values for the random forest classifier
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_depth': [3, 6, 9, 20, 30],
    'max_leaf_nodes': [3, 6, 9, 20, 30]
}

rf_clf = RandomForestClassifier()

grid_search = GridSearchCV(rf_clf, param_grid=param_grid, cv=5)
grid_search.fit(X_train_tfidf, y_train)

print("Best hyperparameters:", grid_search.best_params_)

best_rf_clf = RandomForestClassifier(**grid_search.best_params_)
best_rf_clf.fit(X_train_tfidf, y_train)

print("Training set score:", best_rf_clf.score(X_train_tfidf, y_train))
print("Test set score:", best_rf_clf.score(X_val_tfidf, y_val))

KeyboardInterrupt: 

RandomForestClassifier using CountVectorizer and LatentDirichletAllocation

In [61]:
# Specify the number of topics for LDA
num_topics_lda = 3000  # You can adjust this number as needed
lda = LatentDirichletAllocation(n_components=num_topics_lda, random_state=42)
lda.fit(X_train_cv)  # Fit LDA model for training data
X_train_topics_lda = lda.fit_transform(X_train_cv)
X_val_topics_lda = lda.transform(X_val_cv)  # Transform test data into topic distributions
print(f'Shape after dimensionality reduction of train data: {X_train_topics_lda.shape}')

Shape after dimensionality reduction of train data: (920, 3000)


In [62]:
# Model Training and Prediction for LDA
clf_lda = RandomForestClassifier(max_depth=20, max_leaf_nodes=30, n_estimators=25, class_weight={0:100, 1:60})
clf_lda.fit(X_train_topics_lda, y_train)
y_pred_lda_train = clf_lda.predict(X_train_topics_lda)
y_pred_lda_val = clf_lda.predict(X_val_topics_lda)

In [63]:
# Model Evaluation for LDA
metrics = {'Training Set':
           {'Accuracy':accuracy_score(y_train, y_pred_lda_train),
            'Precision':precision_score(y_train, y_pred_lda_train),
            'Recall':recall_score(y_train, y_pred_lda_train),
            'F1_score':f1_score(y_train, y_pred_lda_train)},
           
           'Validation Set':
           {'Accuracy':accuracy_score(y_val, y_pred_lda_val),
            'Precision':precision_score(y_val, y_pred_lda_val),
            'Recall':recall_score(y_val, y_pred_lda_val),
            'F1_score':f1_score(y_val, y_pred_lda_val)}
            }

metricDf = pd.DataFrame.from_dict(metrics)
print('RandomForestClassifier using CountVectorizer and LatentDirichletAllocation')
display(metricDf)

RandomForestClassifier using CountVectorizer and LatentDirichletAllocation


Unnamed: 0,Training Set,Validation Set
Accuracy,0.778261,0.917391
Precision,0.980892,0.889706
Recall,0.608696,0.968
F1_score,0.75122,0.927203


RandomForestClassifier using TfidfVectorizer and TruncatedSVD

In [64]:
# Specify the number of components for LSA
num_components_lsa = 3000  # You can adjust this number as needed
lsa = TruncatedSVD(n_components=num_components_lsa, random_state=42)
lsa.fit(X_train_tfidf)  # Fit LSA model for training data
X_train_components_lsa = lsa.fit_transform(X_train_tfidf)
X_val_components_lsa = lsa.transform(X_val_tfidf)  # Transform test data into components
print(f'Shape after dimensionality reduction of train data: {X_train_components_lsa.shape}')

Shape after dimensionality reduction of train data: (920, 920)


In [70]:
# Model Training and Prediction for LSA
clf_lsa = RandomForestClassifier(max_depth=20, max_leaf_nodes=30, n_estimators=25, class_weight={0:100, 1:10})
clf_lsa.fit(X_train_components_lsa, y_train)
y_pred_lsa_train = clf_lsa.predict(X_train_components_lsa)
y_pred_lsa_val = clf_lsa.predict(X_val_components_lsa)

In [71]:
# Model Evaluation for LSA
metrics = {'Training Set':
           {'Accuracy':accuracy_score(y_train, y_pred_lsa_train),
            'Precision':precision_score(y_train, y_pred_lsa_train),
            'Recall':recall_score(y_train, y_pred_lsa_train),
            'F1_score':f1_score(y_train, y_pred_lsa_train)},
           
           'Validation Set':
           {'Accuracy':accuracy_score(y_val, y_pred_lsa_val),
            'Precision':precision_score(y_val, y_pred_lsa_val),
            'Recall':recall_score(y_val, y_pred_lsa_val),
            'F1_score':f1_score(y_val, y_pred_lsa_val)}
            }

metricDf = pd.DataFrame.from_dict(metrics)
print('RandomForestClassifier using TfidfVectorizer and TruncatedSVD')
display(metricDf)

RandomForestClassifier using TfidfVectorizer and TruncatedSVD


Unnamed: 0,Training Set,Validation Set
Accuracy,0.995652,0.856522
Precision,1.0,0.933962
Recall,0.992095,0.792
F1_score,0.996032,0.857143


In [67]:
# Pickling ml model, vectorizer and dimensionality reduction models
import pickle

f = open('Deployment\Models\\vectorizerTFIDF.pkl', 'wb')
pickle.dump(vectorizer_lsa, f)
f.close()

f = open('Deployment\Models\\vectorizerCV.pkl', 'wb')
pickle.dump(vectorizer_lda, f)
f.close()

f = open('Deployment\Models\LDA.pkl', 'wb')
pickle.dump(lda, f)
f.close()

f = open('Deployment\Models\TSVD.pkl', 'wb')
pickle.dump(lsa, f)
f.close()

f = open('Deployment\Models\RF_CV_LDA.pkl', 'wb')
pickle.dump(clf_lda, f)
f.close()

f = open('Deployment\Models\RF_TFIDF_TSVD.pkl', 'wb')
pickle.dump(clf_lsa, f)
f.close()