<a href="https://colab.research.google.com/github/VijayRameshkumar/AppliedAI/blob/main/09_DecisionTree/practice/RandomForest_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Shell escape: installs the pinned pandas release (1.1.3) before anything is imported.
!pip install pandas==1.1.3

In [None]:
import pickle

# Both input artefacts sit in the same folder (presumably a mounted Google
# Drive directory on Colab -- adjust for other environments).
_drive_dir = '/content/drive/MyDrive/6_Donors_choose_NB'

# Pickled GloVe vectors; opened later with pickle.load.
glove_vectors_path = _drive_dir + '/glove_vectors'
# Preprocessed project table; read later with pd.read_csv.
preprocessed_data = _drive_dir + '/preprocessed_data.csv'

# **TASK 1:** 

## **`RandomForest Classifier`**

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score

from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

# Keep printed frames narrow: 10 characters of total display width and
# at most 10 characters per column before truncation.
pd.set_option('display.width', 10)
pd.set_option('display.max_colwidth', 10)

import warnings
# Silences every warning raised after this point, from any library.
warnings.filterwarnings("ignore")

import nltk
# Lexicon needed by nltk's VADER SentimentIntensityAnalyzer (used in section 1.4).
nltk.download('vader_lexicon')

## **1.1 Loading Data**

In [None]:
# Read the preprocessed projects, then keep every row labelled 0 followed by
# the first 33458 rows labelled 1 (original row labels are preserved).
label_column = 'project_is_approved'
data = pd.read_csv(preprocessed_data)
data = pd.concat(
    [
        data[data[label_column] == 0],
        data[data[label_column] == 1].head(33458),
    ],
    sort=False,
)

# Separate the label so that `data` holds features only.
target = data[label_column]
data = data.drop(label_column, axis=1)

# Basic info about the dataset: shows which columns are categorical and which numeric.
data.info()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; `stratify` keeps the label ratio the
# same in both splits. The split is seeded (random_state=2, the same seed the
# classifiers in this notebook use) so that tuned hyper-parameters and the
# reported scores always refer to the same rows after Restart & Run All.
X_train, X_test, ytrain, ytest = train_test_split(
    data, target, test_size=0.33, stratify=target, random_state=2
)

# The unsplit frame and label series are not needed any more; drop the references.
del data
del target

## **1.2 TextFeatures Encoding**



### **1.2.1 Tfidf Vectorization**

In [None]:
############### TFIDF - Vectorizer ######################
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF: a term must appear in at least 10 training essays
# and the vocabulary is capped at 5000 features.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
    min_df=10,
)
# Vocabulary and IDF weights are learned from the training essays only.
tfidf.fit(X_train['essay'].values)

### **1.2.2 Tfidf-W2V vectorizer**

In [None]:
################# TFIDF - W2V ###########################
import tqdm

# NOTE(review): pickle.load can execute arbitrary code -- only open trusted files.
with open(glove_vectors_path, 'rb') as glove_file:
    model = pickle.load(glove_file)

# Words that have a vector; `model` is used as a mapping (word -> vector).
glove_words = set(model.keys())

def tfidf_w2v(essay):
    """Return one TF-IDF-weighted average word vector per essay.

    A fresh TfidfVectorizer is fitted on ``essay`` to obtain an IDF value per
    term. Each essay is then represented by the weighted mean of the vectors
    of its words, where a word's weight is ``idf * tf`` and only words present
    in both the GloVe vocabulary and the TF-IDF vocabulary contribute.

    Parameters
    ----------
    essay : pandas.Series of str
        Essays to embed (``.values`` is used for fitting, iteration for
        embedding).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(essay), 300)``. Essays without any usable word
        are left as all-zero rows.

    Notes
    -----
    Reads the module-level ``model`` (word -> vector mapping) and
    ``glove_words``. Later cells rebind ``model`` to classifiers, so this
    function has to be called before those cells run.
    The vector size is hard-coded to 300 -- assumes the loaded vectors are
    300-dimensional.
    """
    vectorizer = TfidfVectorizer(min_df=10, max_features=5000, ngram_range=(1,2))
    vectorizer.fit(essay.values)

    feature_names = vectorizer.get_feature_names()
    dictionary = dict(zip(feature_names, vectorizer.idf_))
    tfidf_words = set(feature_names)

    tfidf_w2v_vectors = []

    for sentence in essay:
        words = sentence.split()  # split once instead of once per word
        vector = np.zeros(300)
        tfidf_weight = 0
        for word in words:
            if (word in glove_words) and (word in tfidf_words):
                # NOTE(review): str.count counts substring occurrences, so a
                # short word is also counted inside longer words.
                tfidf_vec = dictionary[word] * sentence.count(word) / len(words)
                vector += model[word] * tfidf_vec
                tfidf_weight += tfidf_vec

        if tfidf_weight != 0:
            # Normalise by the SUM of the weights; dividing by the last word's
            # weight (`tfidf_vec`) would not give a weighted average.
            vector /= tfidf_weight
        tfidf_w2v_vectors.append(vector)

    return np.array(tfidf_w2v_vectors)

## **1.3 Categorical Feature Encoding & Normalize Numeric Data**

In [None]:
from sklearn.preprocessing import Normalizer, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer


def _encode_categorical(train_values, test_values):
    """Count-encode one categorical column; returns (encoder, train, test) as dense arrays."""
    encoder = CountVectorizer()
    encoder.fit(train_values)  # fit has to happen only on train data
    return (
        encoder,
        encoder.transform(train_values).toarray(),
        encoder.transform(test_values).toarray(),
    )


def _scale_numeric(train_values, test_values):
    """Standardise one numeric column with statistics learned on the train split only."""
    scaler = StandardScaler()
    train_column = train_values.reshape(-1, 1)
    scaler.fit(train_column)
    return (
        scaler,
        scaler.transform(train_column),
        scaler.transform(test_values.reshape(-1, 1)),
    )


ohe1, school_state_ohe_train, school_state_ohe_test = _encode_categorical(
    X_train['school_state'].values, X_test['school_state'].values)

ohe2, teacher_prefix_ohe_train, teacher_prefix_ohe_test = _encode_categorical(
    X_train['teacher_prefix'].values, X_test['teacher_prefix'].values)

ohe3, project_grade_category_ohe_train, project_grade_category_ohe_test = _encode_categorical(
    X_train['project_grade_category'].values, X_test['project_grade_category'].values)

ohe4, clean_categories_ohe_train, clean_categories_ohe_test = _encode_categorical(
    X_train['clean_categories'].values, X_test['clean_categories'].values)

ohe5, clean_subcategories_ohe_train, clean_subcategories_ohe_test = _encode_categorical(
    X_train['clean_subcategories'].values, X_test['clean_subcategories'].values)

# Normalizer rescales each ROW to unit norm; applied to an (n, 1) column it
# maps every non-zero value to 1.0 and wipes out the feature. A column-wise
# scaler fitted on the train split keeps the information and applies the same
# transformation to train and test.
norm, X_train_price_norm, X_test_price_norm = _scale_numeric(
    X_train['price'].values, X_test['price'].values)

# `norm` ends up bound to the scaler of the last numeric column, as before.
norm, X_train_no_prev_proj, X_test_no_prev_proj = _scale_numeric(
    X_train['teacher_number_of_previously_posted_projects'].values,
    X_test['teacher_number_of_previously_posted_projects'].values)

In [None]:
import gc

# Keep the raw essay text: X_train / X_test are replaced by dense matrices below.
X_train_essay, X_test_essay = X_train['essay'], X_test['essay']

# Column order: previous-project count, price, then the five encoded
# categorical features.
train_feature_blocks = (
    X_train_no_prev_proj,
    X_train_price_norm,
    school_state_ohe_train,
    teacher_prefix_ohe_train,
    project_grade_category_ohe_train,
    clean_categories_ohe_train,
    clean_subcategories_ohe_train,
)
test_feature_blocks = (
    X_test_no_prev_proj,
    X_test_price_norm,
    school_state_ohe_test,
    teacher_prefix_ohe_test,
    project_grade_category_ohe_test,
    clean_categories_ohe_test,
    clean_subcategories_ohe_test,
)
X_train = np.column_stack(train_feature_blocks)
X_test = np.column_stack(test_feature_blocks)

gc.collect()

print("Final Data matrix")
for features, labels in ((X_train, ytrain), (X_test, ytest)):
    print(features.shape, labels.shape)

print("="*100)

## **1.4 Sentiment Analyser**

In [None]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

def sentiment_anayser(essay):
    """Score every essay with VADER and return the first three scores as columns.

    Parameters
    ----------
    essay : iterable of str
        Texts to score.

    Returns
    -------
    numpy.ndarray
        One row per essay with three columns taken, in order, from the first
        three values of ``polarity_scores`` (named negative, neutral and
        positive by this function).
    """
    analyzer = SentimentIntensityAnalyzer()

    # One list of score values per essay, in the order the analyzer returns them.
    scored = [list(analyzer.polarity_scores(text).values()) for text in essay]

    negative = np.array([scores[0] for scores in scored])
    neutral = np.array([scores[1] for scores in scored])
    positive = np.array([scores[2] for scores in scored])
    return np.column_stack((negative, neutral, positive))

## **1.5 Hyper_Parameter Tuning** (SET1 & SET2)

In [None]:
from sklearn.model_selection import GridSearchCV

sets = ['tfidf_w2v_vectors', 'tfidf']
scorer = dict()

# The same grid is searched for both feature sets.
parameters = {'max_depth' : [1, 5, 10, 50], 'min_samples_split' : [5, 10, 100, 500]}

# Sentiment columns go into a SEPARATE matrix. X_train itself must stay
# untouched here: section 1.7 appends the sentiment columns to X_train (and
# X_test) once more, so mutating X_train in this cell would leave the train
# matrix with three more columns than the test matrix.
X_train_with_sentiment = np.column_stack((X_train, sentiment_anayser(X_train_essay)))

for df_set in sets:

    # Text representation for this feature set.
    if df_set == 'tfidf':
        temp = tfidf.transform(X_train_essay).toarray()
    elif df_set == 'tfidf_w2v_vectors':
        temp = tfidf_w2v(X_train_essay)

    X = np.column_stack((X_train_with_sentiment, temp))

    print("Final Data matrix")
    print(X.shape, ytrain.shape)

    gsc=GridSearchCV(estimator=RandomForestClassifier(random_state=2),
                     param_grid=parameters, scoring='roc_auc', verbose=1, n_jobs=2, return_train_score=True)

    grid_result = gsc.fit(X, ytrain)
    # Keep the full cross-validation table for the heat maps further down.
    scorer[df_set] = grid_result.cv_results_

    print("#"*50,"\n\n")
    print("\n", df_set, " : ", "\n")

    best_params=grid_result.best_params_
    print(best_params)

    print(grid_result.best_score_,"\n")
    print("#"*50,"\n\n")

    # The dense text matrices are large; release them before the next set.
    del X
    del temp
    gc.collect()

In [None]:
# Full cv_results_ dict stored by the grid search for the 'tfidf_w2v_vectors' feature set.
scorer['tfidf_w2v_vectors']

In [None]:
# Full cv_results_ dict stored by the grid search for the 'tfidf' feature set.
scorer['tfidf']

## **1.6 Cross-Validation Results**

In [None]:
# Hard-coded cross-validation scores, one value per grid point in `params`
# order (max_depth outer, min_samples_split inner).
# NOTE(review): presumably pasted from scorer['tfidf'] of an earlier run -- confirm.
mean_test_score = [
    0.65820944, 0.65820944, 0.65820944, 0.65820944,
    0.68122492, 0.68113072, 0.68077205, 0.68077527,
    0.69357459, 0.69372878, 0.69341197, 0.69277492,
    0.70191902, 0.70235819, 0.70424156, 0.70644806,
]

mean_train_score = [
    0.66895523, 0.66895523, 0.66895523, 0.66895523,
    0.73153556, 0.73019983, 0.72432353, 0.71632896,
    0.82714781, 0.82142404, 0.7903215, 0.75914939,
    0.9999747, 0.99989934, 0.99099531, 0.91503008,
]

# Every (max_depth, min_samples_split) combination of the searched grid.
params = [
    {'max_depth': depth, 'min_samples_split': split}
    for depth in (1, 5, 10, 50)
    for split in (5, 10, 100, 500)
]

In [None]:
# One entry per grid point, in the same order as the score lists.
max_depths = [entry['max_depth'] for entry in params]
min_samples_split = [entry['min_samples_split'] for entry in params]

# Long-format table: one row per (max_depth, min_samples_split) combination.
df = pd.DataFrame({
    'max_depth': pd.Series(max_depths),
    'min_samples_split': pd.Series(min_samples_split),
    'mean_test_score': mean_test_score,
    'mean_train_score': mean_train_score,
})

# Wide tables for the heat maps: rows = min_samples_split, columns = max_depth.
train_heatmap, test_heatmap = (
    df.pivot(index='min_samples_split', columns='max_depth', values=score_column)
    for score_column in ('mean_train_score', 'mean_test_score')
)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One annotated heat map per score table, train first, then test.
for heatmap_frame, heatmap_title in (
    (train_heatmap, "Train_Heatmap"),
    (test_heatmap, "Test_Heatmap"),
):
    sns.heatmap(heatmap_frame, annot=True)
    plt.title(heatmap_title)
    plt.show()

## **1.7 Model Train & Predict**

In [None]:
# Append, in this order, the three sentiment columns and the dense TF-IDF
# columns of the essays to the existing feature matrices.
X_train = np.column_stack((
    X_train,
    sentiment_anayser(X_train_essay),
    tfidf.transform(X_train_essay).toarray(),
))

X_test = np.column_stack((
    X_test,
    sentiment_anayser(X_test_essay),
    tfidf.transform(X_test_essay).toarray(),
))

# Displayed as the cell output.
X_train.shape, X_test.shape

In [None]:
# Random forest with the chosen hyper-parameters; fit() returns the estimator itself.
model = RandomForestClassifier(
    random_state=2,
    min_samples_split=500,
    max_depth=50,
).fit(X_train, ytrain)

# Hard class predictions for the held-out rows.
Y_pred = model.predict(X_test)

## **1.8 ConfusionMatrix**

In [None]:
from sklearn.metrics import confusion_matrix
from mlxtend.plotting import plot_confusion_matrix
import matplotlib.pyplot as plt

# Bold 16pt DejaVu Sans for all text drawn from here on.
font = dict(family='DejaVu Sans', weight='bold', size='16')
plt.rc('font', **font)

# Confusion matrix of true vs predicted test labels, drawn with normalised values as well.
mat = confusion_matrix(ytest, Y_pred)
plot_confusion_matrix(conf_mat=mat, figsize=(5,5), show_normed=True);

## **1.9 AUC-Plot**

In [None]:
from sklearn.metrics import auc

# ROC AUC measures how well the model RANKS samples, so it has to be computed
# from the predicted probability of the positive class. Feeding it the hard
# 0/1 labels from predict() collapses the curve to a single operating point
# and reports a different, misleading number.
train_probs = model.predict_proba(X_train)[:, 1]
test_probs = model.predict_proba(X_test)[:, 1]

print("train_roc_auc_score : " , roc_auc_score(ytrain, train_probs), '\n')
print("test_roc_auc_score : ", roc_auc_score(ytest, test_probs), '\n')

# ROC curve points, reused by the plotting cell below.
train_fpr, train_tpr, train_thresholds = roc_curve(ytrain, train_probs)
test_fpr, test_tpr, test_thresholds = roc_curve(ytest, test_probs)

# Kept for backward compatibility: `probs` ended up holding the test probabilities.
probs = test_probs

# Area under the curves above; equals the roc_auc_score values printed first.
print("train_auc_score : " , auc(train_fpr, train_tpr), '\n')
print("test_auc_score : ", auc(test_fpr, test_tpr), '\n')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="ticks")
sns.set(style='darkgrid')

# (printed prefix, false-positive rates, true-positive rates, colour, line label)
roc_curves = (
    ("train_auc_score : ", train_fpr, train_tpr, 'orange', '_train_ROC'),
    ("test_auc_score : ", test_fpr, test_tpr, 'green', '_test_ROC'),
)

for prefix, fpr, tpr, colour, line_label in roc_curves:
    print(prefix, auc(fpr, tpr), "\n\n")

for prefix, fpr, tpr, colour, line_label in roc_curves:
    plt.plot(fpr, tpr, color=colour, label=line_label)

# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
# Legend entries are matched to the three lines in plotting order.
plt.legend(['train_AUC', 'test_AUC', 'AUC_Boundary'])
plt.show();

## **1.10 False-Positive Prediction WordCloud**

In [None]:
from wordcloud import WordCloud, STOPWORDS

# Convert predictions and labels to plain lists. np.asarray(...) accepts a
# Series, an ndarray or an already-converted list, so this cell can be re-run
# without failing on "'list' object has no attribute 'tolist'".
Y_pred = np.asarray(Y_pred).tolist()
ytest = np.asarray(ytest).tolist()

# Positions (within the test split) that were predicted 1 although the true label is 0.
false_positive = [
    position
    for position, (actual, predicted) in enumerate(zip(ytest, Y_pred))
    if actual == 0 and predicted == 1
]

In [None]:
# Reload the same rows that were split earlier; the original row labels are kept.
data = pd.read_csv(preprocessed_data)
data = pd.concat([data.loc[data['project_is_approved'] == 0], data.loc[data['project_is_approved'] == 1].head(33458)], sort=False)
data = data.drop(columns=['project_is_approved'])

# `false_positive` holds POSITIONS within the test split, not positions in
# `data`, so data.iloc[false_positive] would pick unrelated rows. X_test_essay
# still carries the original row labels in test order: translate positions to
# labels first, then select those rows by label.
fp_labels = X_test_essay.iloc[false_positive].index
fp_rows = data.loc[fp_labels]

fp_essay = fp_rows['essay']
fp_price = fp_rows['price']
fp_teacher_number_of_previously_posted_projects = fp_rows['teacher_number_of_previously_posted_projects']

stopwords = set(STOPWORDS)
print('#'*50, '\n', 'WORDS IN ESSAYS - FOR FALSE-POSITIVE PREDICTIONS', '\n', '#'*50)

# All lower-cased tokens of the false-positive essays, joined into one text.
word_cloud = [word.lower() for sentence in fp_essay for word in sentence.split()]
comment_words = " ".join(word_cloud) + " "

wordcloud = WordCloud(width = 800, height = 800,
                background_color ='white',
                stopwords = stopwords,
                min_font_size = 10).generate(comment_words)

# plot the WordCloud image
plt.figure(figsize = (10, 10), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)
plt.show()

In [None]:
import seaborn as sns

# Wide, short canvas (20 x 5) for a single box plot.
plt.figure(figsize=(20,5))

# Box plot of the `fp_price` values selected for the false-positive predictions.
sns.set_theme(style="whitegrid")
sns.boxplot(fp_price)

plt.title("FALSE-POSITIVE PRICE")
plt.legend(["FALSE-POSITIVE PRICE"])
plt.show()

In [None]:
sns.set_style("whitegrid");
plt.figure(figsize=(20,5))

# Title appearance for axes created from here on.
plt.rcParams['axes.titlesize'] = 20
plt.rcParams['axes.titleweight'] = 10

# 10-bin histogram of the previously-posted-project counts of the false positives.
count, bin_edges = np.histogram(fp_teacher_number_of_previously_posted_projects, bins=10, density=True)

# Rescale the bin values so they sum to 1; the cumulative sum is computed but not plotted.
nodes_pdf = count / sum(count)
nodes_cdf = np.cumsum(nodes_pdf)

# Each value is plotted at the right edge of its bin.
plt.plot(bin_edges[1:],nodes_pdf, color='green', marker='o', linestyle='solid')

plt.title("PDF - FALSE POSITIVE teacher_number_of_previously_posted_projects\n")
plt.legend(['FP - teacher_no.of_prev_posted_projects'])
plt.show();

# **TASK 2:** 

## **`DecisionTree Classifier`**

In [None]:
# SET 1 - TFIDF: shapes of the matrices and label vectors carried over from Task 1.
shapes = (X_train.shape, ytrain.shape, X_test.shape, np.array(ytest).shape)
print(*shapes)

## **2.1 Non-Zero Feature Importance Feature Selection**

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.tree import DecisionTreeClassifier

# Tree with default depth settings, fitted on all features; its
# feature_importances_ drive the feature selection below.
model = DecisionTreeClassifier(random_state=2).fit(X_train, ytrain)

In [None]:
# Column positions whose importance in the fitted tree is not exactly zero.
non_zero_Feature_importance_index = np.flatnonzero(model.feature_importances_ != 0.0).tolist()

In [None]:
selected_columns = non_zero_Feature_importance_index
print(len(selected_columns))

# Keep only the selected columns: X from the train matrix, X_ from the test matrix.
X, X_ = (
    np.take(matrix, selected_columns, axis=1)
    for matrix in (X_train, X_test)
)

print(X.shape, X_.shape)

## **2.2 Hyper_Parameter Tuning**

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# `None` (the Python object) means "no depth limit". The string 'None' is not
# a valid max_depth: every fit with it fails and the grid search records NaN
# scores for those candidates instead of evaluating an unlimited-depth tree.
parameters={'max_depth' : [1, 5, 10, 50, None], 'min_samples_split' : [2, 5, 10, 100, 500]}

# Default cross-validation, scored by ROC AUC; train scores are kept for the heat maps.
gsc=GridSearchCV(estimator=DecisionTreeClassifier(random_state=2),
                    param_grid=parameters, scoring='roc_auc', verbose=1, n_jobs=-1, return_train_score=True)

# X holds only the columns selected by non-zero feature importance.
grid_result = gsc.fit(X, ytrain)

print("#"*50,"\n\n")
best_params=grid_result.best_params_
print(best_params,'\n')
print(grid_result.best_score_,"\n")
print("#"*50,"\n\n")

In [None]:
# Full cross-validation table of the decision-tree grid search (shown as the cell output).
grid_result.cv_results_

## **2.3 Cross-Validation Result**

In [None]:
# Pull the pieces needed for the heat maps out of the grid-search results.
cv_results = grid_result.cv_results_
params = cv_results['params']
mean_train_score = pd.Series(cv_results['mean_train_score'])
mean_test_score = pd.Series(cv_results['mean_test_score'])

# One entry per candidate, in cv_results_ order.
min_samples_split = [candidate['min_samples_split'] for candidate in params]
max_depth = [candidate['max_depth'] for candidate in params]

# Long-format table: one row per candidate.
df = pd.DataFrame({
    'min_samples_split': pd.Series(min_samples_split),
    'mean_test_score': mean_test_score,
    'mean_train_score': mean_train_score,
    'max_depth': pd.Series(max_depth),
})

# Wide tables: rows = min_samples_split, columns = max_depth.
train_heatmap, test_heatmap = (
    df.pivot(index='min_samples_split', columns='max_depth', values=score_column)
    for score_column in ('mean_train_score', 'mean_test_score')
)

for heatmap_frame, heatmap_title in (
    (train_heatmap, "Train_Heatmap"),
    (test_heatmap, "Test_Heatmap"),
):
    sns.heatmap(heatmap_frame, annot=True)
    plt.title(heatmap_title)
    plt.show()

In [None]:
# SET 1 - TFIDF: shapes of the full (unselected) matrices and the label vectors.
shapes = (X_train.shape, ytrain.shape, X_test.shape, np.array(ytest).shape)
print(*shapes)

## **2.4 Model Train**

In [None]:
# Decision tree with fixed hyper-parameters, trained on the selected columns (X)
# and evaluated on the matching test columns (X_).
model = DecisionTreeClassifier(
    min_samples_split=500,
    max_depth=10,
    random_state=2,
)
model.fit(X, ytrain)
Y_pred = model.predict(X_)

## **2.5 ConfusionMatrix**

In [None]:
# Bold 16pt DejaVu Sans for all text drawn from here on.
font = dict(family='DejaVu Sans', weight='bold', size='16')
plt.rc('font', **font)

# Confusion matrix of true vs predicted test labels for the decision tree.
mat = confusion_matrix(ytest, Y_pred)
plot_confusion_matrix(conf_mat=mat, figsize=(5,5), show_normed=True);

## **2.6 AUC-Plot**

In [None]:
from sklearn.metrics import auc

# ROC AUC has to be computed from the predicted probability of the positive
# class; hard 0/1 labels from predict() reduce the curve to one operating
# point and give a different, misleading number.
train_probs = model.predict_proba(X)[:, 1]
test_probs = model.predict_proba(X_)[:, 1]

print("train_roc_auc_score : " , roc_auc_score(ytrain, train_probs),'\n')
print("test_roc_auc_score : ", roc_auc_score(ytest, test_probs), '\n')

# ROC curve points, reused by the plotting cell below.
train_fpr, train_tpr, train_thresholds = roc_curve(ytrain, train_probs)
test_fpr, test_tpr, test_thresholds = roc_curve(ytest, test_probs)

# Kept for backward compatibility: `probs` ended up holding the test probabilities.
probs = test_probs

# Area under the curves above; equals the roc_auc_score values printed first.
print("train_auc_score : " , auc(train_fpr, train_tpr), '\n')
print("test_auc_score : ", auc(test_fpr, test_tpr), '\n')


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="ticks")
sns.set(style='darkgrid')

# (printed prefix, false-positive rates, true-positive rates, colour, line label)
roc_curves = (
    ("train_auc_score : ", train_fpr, train_tpr, 'orange', '_train_ROC'),
    ("test_auc_score : ", test_fpr, test_tpr, 'green', '_test_ROC'),
)

for prefix, fpr, tpr, colour, line_label in roc_curves:
    print(prefix, auc(fpr, tpr), "\n\n")

for prefix, fpr, tpr, colour, line_label in roc_curves:
    plt.plot(fpr, tpr, color=colour, label=line_label)

# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
# Legend entries are matched to the three lines in plotting order.
plt.legend(['train_AUC', 'test_AUC', 'AUC_Boundary'])
plt.show();

# **`RESULT`**

---


In [None]:
from prettytable import PrettyTable

# Summary table: one row per (vectorizer, model) combination reported above.
x = PrettyTable()
x.field_names = ["Vectorizer", "Model", "Hyper_Parameter", "Train_AUC", "Test_AUC"]

result_rows = [
    ["TFIDF-W2V", 'RandomForest', 'max_depth : 50, min_samples_split : 100', 0.93, 0.68],
    ["TFIDF", 'RandomForest', 'max_depth : 50, min_samples_split : 500', 0.92, 0.71],
    ["TFIDF", 'DecisionTree', 'max_depth : 10, min_samples_split : 500', 0.69, 0.65],
]
for result_row in result_rows:
    x.add_row(result_row)

print(x)