# Verzeo - Major Project (ML-JULY-B1)

### Done by : Sanjay Marreddi  
### Email Id  : sanjay.marreddi.19041@iitgoa.ac.in
    

#### First let us import the required Libraries

In [None]:
import re
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from nltk.tokenize import word_tokenize,sent_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer,PorterStemmer

from collections import defaultdict
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score,classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

%matplotlib inline

#### Let us read the given Dataset

In [None]:
# Load the raw dataset from the Excel workbook that sits next to the notebook.
DATASET_PATH = 'information.xlsx'
information = pd.read_excel(DATASET_PATH)

## Exploratory data analysis and Data Cleaning

In [None]:
information.head()

In [None]:
information.shape

In [None]:
information.columns.tolist()

In [None]:
information.describe()

In [None]:
information.info()

In [None]:
information.isnull().sum() # Checking the Missing Values in each Column of the DataFrame

In [None]:
# Correlation only makes sense for numeric data. The frame still holds text
# columns at this point, and pandas >= 2.0 raises instead of silently dropping
# them, so select the numeric/boolean columns explicitly.
information.select_dtypes(include=["number", "bool"]).corr()

In [None]:
# Heatmap of the correlation matrix.
# Only numeric/boolean columns are correlated: the frame still contains text
# columns here, which DataFrame.corr() rejects on pandas >= 2.0.
sns.heatmap(information.select_dtypes(include=["number", "bool"]).corr())

In [None]:
information.description

In [None]:
information.name

In [None]:
information.text

###### Feature Selection

In [None]:
# Narrow the frame down to the gender label, its confidence score and the
# three text columns used in the rest of the notebook.
selected_columns = ["gender", "name","description", "gender:confidence","text"]
information = information[selected_columns]

###### Dropping the rows with NaN 

In [None]:
# Discard every row that has a missing value in any of the selected columns.
information = information.dropna(axis=0, how="any")

In [None]:
information.shape

#### Data Cleaning of "description" column

In [None]:
from nltk.corpus import stopwords

In [None]:
# English stopwords from the NLTK corpus, materialised as a plain list.
list_of_stopwords = [*stopwords.words('english')]

In [None]:
information.shape

In [None]:
information.columns

In [None]:
# Build a boolean mask that is True for rows whose description is a string.
# isinstance() is the idiomatic type check, and iterating the values directly
# avoids the positional index loop.
temp = [isinstance(value, str) for value in information['description'].values]

In [None]:
# Apply the boolean mask built above. The explicit copy makes the result an
# independent frame, so adding the cleaned_* columns afterwards does not write
# into a slice of the original data (SettingWithCopyWarning).
information = information.loc[temp, :].copy()

In [None]:
stemmer = PorterStemmer()
words = stopwords.words("english")
# A set gives O(1) membership tests instead of scanning the list for every token.
stopword_set = set(words)
# Data cleaning using regex: replace every non-letter with a space, lower-case,
# drop stopwords, then stem what is left.
# The text is lower-cased BEFORE the stopword check. NLTK's English stopword
# list is lower-case, so checking the raw tokens let capitalised stopwords
# ("The", "I", "And") slip through.
information['cleaned_description'] = information['description'].apply(
    lambda x: " ".join(
        stemmer.stem(token)
        for token in re.sub("[^a-zA-Z]", " ", x).lower().split()
        if token not in stopword_set
    )
)

In [None]:
information.head()

#### Data Cleaning of "text" column

In [None]:
information.columns

In [None]:
stemmer = PorterStemmer()
words = stopwords.words("english")
# A set gives O(1) membership tests instead of scanning the list for every token.
stopword_set = set(words)
# Same cleaning as for the description column: letters only, lower-case,
# stopwords removed, remaining tokens stemmed.
# Lower-casing happens BEFORE the stopword check so that capitalised stopwords
# ("The", "I", "And") are removed as well.
information['cleaned_text'] = information['text'].apply(
    lambda x: " ".join(
        stemmer.stem(token)
        for token in re.sub("[^a-zA-Z]", " ", x).lower().split()
        if token not in stopword_set
    )
)

In [None]:
information.head()

In [None]:
# Keep only the rows whose gender label has a confidence of exactly 1.
fully_confident = information["gender:confidence"] == 1
information = information[fully_confident]

In [None]:
information.head()

In [None]:
# Model inputs: the user name plus the two cleaned text columns.
feature_columns = ["name","cleaned_text","cleaned_description"]
X_information = information[feature_columns]

# Target variable, kept as a single-column frame.
y_information = information[["gender"]]

In [None]:
print (X_information.shape)
X_information.head()

In [None]:
# Show the shape and first rows of the target frame. The shape printed here
# used to be X_information's, which did not match the frame displayed below it.
print(y_information.shape)
y_information.head()

##### Label Encoding the gender column for ease of Calculation 

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# y_information was created by selecting a column out of `information`; work on
# an explicit copy so that adding a column does not write into a slice of the
# original frame (SettingWithCopyWarning).
y_information = y_information.copy()
# Map each distinct gender label to an integer code.
y_information['gender_encoded'] = le.fit_transform(y_information["gender"])

In [None]:
# From here on only the integer-encoded target is needed.
y_information = y_information.loc[:, ["gender_encoded"]]

In [None]:
print (X_information.shape)
X_information.head()

In [None]:
# Show the shape and first rows of the encoded target frame. The shape printed
# here used to be X_information's, which did not match the frame displayed below.
print(y_information.shape)
y_information.head()

# Ensemble Machine Learning Modelling

## 1. Classification using Naive Bayes Algorithm 

In [None]:
print (X_information.shape)
X_information.head()

In [None]:
print(y_information.shape)
y_information.head()

In [None]:
# Hold out 1% of the rows for evaluation; the fixed seed makes the split repeatable.
Train_X_Nb, Test_X_Nb, Train_Y_Nb, Test_Y_Nb = train_test_split(
    X_information['cleaned_description'],
    y_information['gender_encoded'],
    test_size=0.01,
    random_state=1,
)

In [None]:
# Encode the training and testing labels with ONE shared mapping.
# Fitting a second encoder on the test labels would renumber the classes
# whenever a class is missing from the small test split, which would make
# predictions and targets incomparable. Fitting on the complete label column
# only learns the label vocabulary; no feature information is involved.
Encoder = LabelEncoder()
Encoder.fit(y_information['gender_encoded'])
Train_Y_Nb = Encoder.transform(Train_Y_Nb)
Test_Y_Nb = Encoder.transform(Test_Y_Nb)

In [None]:
# Transforming text to feature vectors that can be used as input to the estimator.
# The vocabulary and IDF weights are learned from the TRAINING split only;
# fitting on the whole corpus would leak test-set statistics into the features.
T_vect = TfidfVectorizer(max_features=5000)
T_vect.fit(Train_X_Nb)

In [None]:
# Vectorise both partitions with the already-fitted TF-IDF vectorizer.
Train_X_Nb_T, Test_X_Nb_T = (
    T_vect.transform(partition) for partition in (Train_X_Nb, Test_X_Nb)
)

In [None]:
# Train a multinomial Naive Bayes classifier on the TF-IDF training matrix.
Naive = naive_bayes.MultinomialNB()
Naive.fit(X=Train_X_Nb_T, y=Train_Y_Nb)

In [None]:
# Predict labels for the held-out rows.
predictions_Nb = Naive.predict(Test_X_Nb_T)

# Report accuracy as a percentage. Arguments are passed by keyword in the
# documented (y_true, y_pred) order; accuracy is symmetric, so the value is unchanged.
nb_accuracy = accuracy_score(y_true=Test_Y_Nb, y_pred=predictions_Nb)
print("Naive Bayes Accuracy Score -> ", nb_accuracy*100)

## 2. Classification using Support Vector Machine Algorithm 

In [None]:
print (X_information.shape)
X_information.head()

In [None]:
print (y_information.shape)
y_information.head()

In [None]:
# Hold out 1% of the rows for evaluation.
# random_state=1 matches the Naive Bayes split, so the models are scored on the
# same held-out rows; with a different seed per model the accuracies are
# measured on different test sets and cannot be compared.
# NOTE(review): a 1% test split is very small, so the accuracy estimate is
# noisy - consider a larger test_size or cross-validation.
Train_X_Svm, Test_X_Svm, Train_Y_Svm, Test_Y_Svm = train_test_split(
    X_information['cleaned_description'],
    y_information['gender_encoded'],
    test_size=0.01,
    random_state=1,
)

In [None]:
# Encode the training and testing labels with ONE shared mapping.
# Fitting a second encoder on the test labels would renumber the classes
# whenever a class is missing from the small test split, which would make
# predictions and targets incomparable. Fitting on the complete label column
# only learns the label vocabulary; no feature information is involved.
Encoder = LabelEncoder()
Encoder.fit(y_information['gender_encoded'])
Train_Y_Svm = Encoder.transform(Train_Y_Svm)
Test_Y_Svm = Encoder.transform(Test_Y_Svm)

In [None]:
# Transforming text to feature vectors that can be used as input to the estimator.
# The vocabulary and IDF weights are learned from the TRAINING split only;
# fitting on the whole corpus would leak test-set statistics into the features.
T_vect = TfidfVectorizer(max_features=5000)
T_vect.fit(Train_X_Svm)

In [None]:
# Vectorise both partitions with the already-fitted TF-IDF vectorizer.
Train_X_Svm_T, Test_X_Svm_T = (
    T_vect.transform(partition) for partition in (Train_X_Svm, Test_X_Svm)
)

In [None]:
# Fit a linear-kernel support vector classifier on the TF-IDF training matrix.
# `degree` only applies to the polynomial kernel and `gamma` only to the
# rbf/poly/sigmoid kernels; both are ignored by kernel='linear', so they were
# removed to avoid suggesting they were tuned.
SVM = svm.SVC(C=2.0, kernel='linear')
SVM.fit(Train_X_Svm_T,Train_Y_Svm)

In [None]:
# Predict labels for the held-out rows.
predictions_Svm = SVM.predict(Test_X_Svm_T)

# Report accuracy as a percentage. Arguments are passed by keyword in the
# documented (y_true, y_pred) order; accuracy is symmetric, so the value is unchanged.
svm_accuracy = accuracy_score(y_true=Test_Y_Svm, y_pred=predictions_Svm)
print("SVM Accuracy Score -> ", svm_accuracy*100)

## 3. Classification using Logistic Regression Algorithm 

In [None]:
print (X_information.shape)
X_information.head()

In [None]:
print(y_information.shape)
y_information.head()

In [None]:
# Hold out 1% of the rows for evaluation.
# random_state=1 matches the Naive Bayes split, so the models are scored on the
# same held-out rows; with a different seed per model the accuracies are
# measured on different test sets and cannot be compared.
# NOTE(review): a 1% test split is very small, so the accuracy estimate is
# noisy - consider a larger test_size or cross-validation.
Train_X_Lr, Test_X_Lr, Train_Y_Lr, Test_Y_Lr = train_test_split(
    X_information['cleaned_description'],
    y_information['gender_encoded'],
    test_size=0.01,
    random_state=1,
)

In [None]:
# Encode the training and testing labels with ONE shared mapping.
# Fitting a second encoder on the test labels would renumber the classes
# whenever a class is missing from the small test split, which would make
# predictions and targets incomparable. Fitting on the complete label column
# only learns the label vocabulary; no feature information is involved.
Encoder = LabelEncoder()
Encoder.fit(y_information['gender_encoded'])
Train_Y_Lr = Encoder.transform(Train_Y_Lr)
Test_Y_Lr = Encoder.transform(Test_Y_Lr)

In [None]:
# Transforming text to feature vectors that can be used as input to the estimator.
# The vocabulary and IDF weights are learned from the TRAINING split only;
# fitting on the whole corpus would leak test-set statistics into the features.
T_vect = TfidfVectorizer(max_features=5000)
T_vect.fit(Train_X_Lr)

In [None]:
# Vectorise both partitions with the already-fitted TF-IDF vectorizer.
Train_X_Lr_T, Test_X_Lr_T = (
    T_vect.transform(partition) for partition in (Train_X_Lr, Test_X_Lr)
)

In [None]:
# Train logistic regression on the TF-IDF training matrix, allowing the
# solver up to 1000 iterations.
lr = LogisticRegression(max_iter=1000)
lr.fit(X=Train_X_Lr_T, y=Train_Y_Lr)

In [None]:
# Predict labels for the held-out rows.
predictions_Lr = lr.predict(Test_X_Lr_T)

# Report accuracy as a percentage. Arguments are passed by keyword in the
# documented (y_true, y_pred) order; accuracy is symmetric, so the value is unchanged.
lr_accuracy = accuracy_score(y_true=Test_Y_Lr, y_pred=predictions_Lr)
print("LR Accuracy Score -> ", lr_accuracy*100)

### Ensemble Machine Learning Modelling -> Choosing best Algo based on Accuracy

###### From the above models, the accuracy obtained with

1. Naive Bayes Algorithm           is **71.42857142857143 %**

2. Support Vector Machine Algorithm is **68.0672268907563 %**

3. Logistic Regression Algorithm is  **70.58823529411765 %**




###### So, for the given dataset and the hyperparameters used, we can say, based on the accuracy score, that the **Naive Bayes algorithm is best** in this case.

# Questions on DataSet 

###### 1. What are the most common emotions/words used by Males and Females?

In [None]:
# Rebuild the cleaned dataset from the raw file, this time keeping the gender
# column next to the text features in X_information.
information = pd.read_excel('information.xlsx')

information = information[["gender", "name", "description", "gender:confidence", "text"]]

information = information.dropna()

# Keep only the rows whose description is a string. The explicit copy makes
# the result an independent frame, so the column assignments below do not
# write into a slice of the original data (SettingWithCopyWarning).
temp = [isinstance(value, str) for value in information['description'].values]
information = information.loc[temp, :].copy()

stemmer = PorterStemmer()
words = stopwords.words("english")
# A set gives O(1) membership tests instead of scanning the list for every token.
stopword_set = set(words)


def clean_text(raw):
    """Return *raw* reduced to lower-case, stopword-free, stemmed tokens.

    Every non-letter is replaced by a space, the text is lower-cased, stopwords
    are dropped and the remaining tokens are stemmed and re-joined with single
    spaces. Lower-casing happens before the stopword check so that capitalised
    stopwords ("The", "I", "And") are removed as well.
    """
    tokens = re.sub("[^a-zA-Z]", " ", raw).lower().split()
    return " ".join(stemmer.stem(token) for token in tokens if token not in stopword_set)


# Data cleaning using regex, applied identically to both free-text columns.
information['cleaned_description'] = information['description'].apply(clean_text)
information['cleaned_text'] = information['text'].apply(clean_text)

# Keep only the rows whose gender label has a confidence of exactly 1.
information = information[information["gender:confidence"] == 1]

X_information = information[["gender", "name", "cleaned_text", "cleaned_description"]]


In [None]:
X_information.gender.value_counts()

In [None]:
# Restrict the frame to the rows labelled as one of the two human genders.
human_genders = ["female", "male"]
X_info = X_information.loc[X_information.gender.isin(human_genders)]

In [None]:
X_info