<a href="https://colab.research.google.com/github/Paul-mwaura/Gender-Based-Discrimination-NLP/blob/main/GBV_Sentiment_Analysis_Modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas, xgboost, numpy, textblob, string
import tensorflow as tf

In [None]:
# Load the cleaned GBV tweet dataset from the Colab working directory.
df = pd.read_csv(filepath_or_buffer="/content/GBV_data_clean_v04.csv")
# Preview the first three rows.
df.iloc[:3]

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,username,new_tweets,clean_tweets,tweets_without_stopwords,replies,retweets,likes,pol_nltk,nltk_sentiment
0,12,12,2keercous,Didn't he sxually assault a woman,didnt he sxually assault a woman,didnt sxually assault woman,0,0,0,"{'neg': 0.0, 'neu': 0.494, 'pos': 0.506}",pos
1,26,27,imshanereaction,And for the love of God please no more pictur...,and for the love of god please no more pictur...,love god please pictures man groping woman lik...,1,0,14,"{'neg': 0.0, 'neu': 0.419, 'pos': 0.581}",pos
2,64,65,preconciliatio1,. lol,lol,lol,0,0,0,"{'neg': 0.0, 'neu': 0.0, 'pos': 1.0}",pos


In [None]:
# Keep only the model input text and the sentiment label.
df = df.loc[:, ['tweets_without_stopwords', 'nltk_sentiment']]
df.head(5)

Unnamed: 0,tweets_without_stopwords,nltk_sentiment
0,didnt sxually assault woman,pos
1,love god please pictures man groping woman lik...,pos
2,lol,pos
3,agree cant attack assault someone exactly woma...,pos
4,dm ok said seem like lovely man lets chat,pos


In [None]:
# Number of missing values in each column.
df.isnull().sum()

tweets_without_stopwords    0
nltk_sentiment              0
dtype: int64

In [None]:
# Number of rows that repeat an earlier row (first occurrence is not counted).
df.duplicated(keep='first').sum()

96

In [None]:
# Drop repeated rows by reassignment rather than mutating the frame in place,
# then confirm that no duplicates remain.
df = df.drop_duplicates()
df.duplicated().sum()

0

In [None]:
# Rows per sentiment class, smallest class first.
(
    df.groupby(['nltk_sentiment'])['nltk_sentiment']
    .count()
    .sort_values()
)

nltk_sentiment
neg    1320
pos    1350
neu    1365
Name: nltk_sentiment, dtype: int64

### Split the data into Features and Labels

In [None]:
# Split the dataset into training and validation sets.
# A fixed random_state makes the split, and every accuracy computed from it,
# reproducible across kernel restarts.
X_train, X_valid, y_train, y_valid = model_selection.train_test_split(
    df['tweets_without_stopwords'],
    df['nltk_sentiment'],
    random_state=42,
)

# Label-encode the target variable.
# The encoder is fitted on the training labels only and then re-used for the
# validation labels, so both splits share one label -> integer mapping.
# Re-fitting on y_valid could yield a different mapping if the set of classes
# present in the two splits ever differs.
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_valid = encoder.transform(y_valid)

In [None]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False, label_valid=None):
    """Fit ``classifier`` on the training features and return its validation accuracy.

    Args:
        classifier: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        feature_vector_train: Training feature matrix.
        label: Training labels aligned with ``feature_vector_train``.
        feature_vector_valid: Validation feature matrix.
        is_neural_net: When True, the output of ``predict`` is reduced with
            ``argmax`` over the last axis before scoring.
        label_valid: Ground-truth labels for ``feature_vector_valid``. When
            omitted, the notebook-level ``y_valid`` is used, which keeps
            existing four-argument calls working unchanged.

    Returns:
        The accuracy computed by ``metrics.accuracy_score``.
    """
    if label_valid is None:
        # Fall back to the notebook-level validation labels.
        label_valid = y_valid

    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)

    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)

    if is_neural_net:
        predictions = predictions.argmax(axis=-1)

    # accuracy_score is documented as (y_true, y_pred)
    return metrics.accuracy_score(label_valid, predictions)

### Count Vectors as features
Count Vector is a matrix notation of the dataset in which every row represents a document from the corpus, every column represents a term from the corpus, and every cell represents the frequency count of a particular term in a particular document.

In [None]:
# Create a count vectorizer object.
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')

# Learn the vocabulary from the training split only, so that no information
# from the validation tweets leaks into the features, then apply the same
# fitted vocabulary to the validation split.
xtrain_count = count_vect.fit_transform(X_train)
xvalid_count = count_vect.transform(X_valid)

### TF-IDF Vectors as features
>>
* TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document)
* IDF(t) = log_e(Total number of documents / Number of documents with term t in it)

>>
TF-IDF Vectors can be generated at different levels of input tokens (words, characters, n-grams)
* a. Word Level TF-IDF : Matrix representing tf-idf scores of every term in different documents
* b. N-gram Level TF-IDF : N-grams are combinations of N consecutive terms. This matrix represents the tf-idf scores of N-grams
* c. Character Level TF-IDF : Matrix representing tf-idf scores of character level n-grams in the corpus

In [None]:
# Each vectorizer is fitted on the training split only: the vocabulary chosen
# by max_features and the IDF weights would otherwise be computed from the
# validation tweets as well, leaking validation information into the features.

# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
xtrain_tfidf = tfidf_vect.fit_transform(X_train)
xvalid_tfidf = tfidf_vect.transform(X_valid)

# ngram level tf-idf (word bigrams and trigrams)
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
xtrain_tfidf_ngram = tfidf_vect_ngram.fit_transform(X_train)
xvalid_tfidf_ngram = tfidf_vect_ngram.transform(X_valid)

# characters level tf-idf (character bigrams and trigrams)
# token_pattern is not passed here: it is ignored when analyzer='char' and
# only produces a "will not be used" warning.
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', ngram_range=(2,3), max_features=5000)
xtrain_tfidf_ngram_chars = tfidf_vect_ngram_chars.fit_transform(X_train)
xvalid_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(X_valid)

  "The parameter 'token_pattern' will not be used"


## Model Building

In [None]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False, label_valid=None):
    """Fit ``classifier`` on the training features and return its validation accuracy.

    Args:
        classifier: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        feature_vector_train: Training feature matrix.
        label: Training labels aligned with ``feature_vector_train``.
        feature_vector_valid: Validation feature matrix.
        is_neural_net: When True, the output of ``predict`` is reduced with
            ``argmax`` over the last axis before scoring.
        label_valid: Ground-truth labels for ``feature_vector_valid``. When
            omitted, the notebook-level ``y_valid`` is used, which keeps
            existing four-argument calls working unchanged.

    Returns:
        The accuracy computed by ``metrics.accuracy_score``.
    """
    if label_valid is None:
        # Fall back to the notebook-level validation labels.
        label_valid = y_valid

    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)

    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)

    if is_neural_net:
        predictions = predictions.argmax(axis=-1)

    # accuracy_score is documented as (y_true, y_pred)
    return metrics.accuracy_score(label_valid, predictions)

### Naive Bayes
Naive Bayes is a classification technique based on Bayes’ Theorem with an assumption of independence among predictors. 

In [None]:
# Multinomial Naive Bayes, evaluated once per feature representation.
print("Naive Bayes\n")

# (printed heading, training matrix, validation matrix) in reporting order
nb_feature_sets = (
    ("Count Vectors: ", xtrain_count, xvalid_count),
    ("WordLevel TF-IDF: ", xtrain_tfidf, xvalid_tfidf),
    ("N-Gram Vectors: ", xtrain_tfidf_ngram, xvalid_tfidf_ngram),
    ("CharLevel Vectors: ", xtrain_tfidf_ngram_chars, xvalid_tfidf_ngram_chars),
)

for heading, train_matrix, valid_matrix in nb_feature_sets:
    # A fresh classifier is created for every feature set.
    accuracy = train_model(naive_bayes.MultinomialNB(), train_matrix, y_train, valid_matrix)
    print(heading, accuracy*100)

Naive Bayes

Count Vectors:  71.55599603567889
WordLevel TF-IDF:  75.02477700693755
N-Gram Vectors:  56.987115956392465
CharLevel Vectors:  67.78989098116948


### Linear Classifier (Logistic Regression)
Logistic regression measures the relationship between the categorical dependent variable and one or more independent variables by estimating probabilities using a logistic/sigmoid function.

In [None]:
# Logistic Regression (a linear classifier, not linear regression)
print("Logistic Regression\n")

# One estimator instance is re-fitted for each feature set below; after this
# cell `lr` holds the fit on the character-level TF-IDF vectors.
lr = linear_model.LogisticRegression()

# (printed heading, training matrix, validation matrix) in reporting order
lr_feature_sets = (
    ("Count Vectors: ", xtrain_count, xvalid_count),
    ("N-Gram Vectors: ", xtrain_tfidf_ngram, xvalid_tfidf_ngram),
    ("CharLevel Vectors: ", xtrain_tfidf_ngram_chars, xvalid_tfidf_ngram_chars),
)

for heading, train_matrix, valid_matrix in lr_feature_sets:
    accuracy = train_model(lr, train_matrix, y_train, valid_matrix)
    print(heading, accuracy*100)

Linear Regression

Count Vectors:  78.49355797819624
N-Gram Vectors:  57.97819623389494
CharLevel Vectors:  73.6372646184341


## Save Model

### TF-IDF Model

In [None]:
import pickle
import joblib

# Logistic regression on word-level TF-IDF vectors (re-fits the shared `lr` instance).
accuracy = train_model(lr, xtrain_tfidf, y_train, xvalid_tfidf)
print("WordLevel TF-IDF: ", accuracy*100)

# Save the model as a pickle file.
with open('tfidf_model.pickle', 'wb') as handle:
    pickle.dump(lr, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Persist the fitted vectorizer next to the classifier: the saved model only
# accepts features produced by this exact vectorizer, so raw text cannot be
# scored later without it.
joblib.dump(tfidf_vect, 'tfidf_vectorizer.joblib')

# Save the model as a joblib file (kept as the last expression so the cell
# still displays the written model path).
filename = 'tfidf_model.joblib'
joblib.dump(lr, filename)

WordLevel TF-IDF:  78.09712586719525


['tfidf_model.joblib']

### Implementing a SVM Model
Support Vector Machine (SVM) is a supervised machine learning algorithm that can be used for both classification and regression tasks. The model finds the best possible hyperplane / line that separates the classes.

In [None]:
# Support vector classifier with default settings, scored on the word n-gram TF-IDF features.
svm_classifier = svm.SVC()
accuracy = train_model(svm_classifier, xtrain_tfidf_ngram, y_train, xvalid_tfidf_ngram)
print("SVM, N-Gram Vectors: ", accuracy*100)

SVM, N-Gram Vectors:  55.50049554013875


### Bagging Model (Random Forest Model)

In [None]:
# Random Forest Model
print("Random Forest Model")

# Random forests are stochastic (bootstrap samples and random feature
# subsets), so the estimator is seeded to keep the reported accuracies
# reproducible across runs.
# RF on Count Vectors
accuracy = train_model(ensemble.RandomForestClassifier(random_state=42), xtrain_count, y_train, xvalid_count)
print("Count Vectors: ", accuracy*100)

# RF on Word Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(random_state=42), xtrain_tfidf, y_train, xvalid_tfidf)
print("WordLevel TF-IDF: ", accuracy*100)

Random Forest Model
Count Vectors:  68.18632309217047
WordLevel TF-IDF:  68.97918731417245


### Boosting Model (Extreme Gradient Boosting Model)
Boosting is a machine learning ensemble meta-algorithm for primarily reducing bias, and also variance in supervised learning, and a family of machine learning algorithms that convert weak learners to strong ones. 

A weak learner is defined to be a classifier that is only slightly correlated with the true classification

In [None]:
# Extreme Gradient Boosting, evaluated once per feature representation.
print("XGB Model\n")

# (printed heading, training matrix, validation matrix) in reporting order
xgb_feature_sets = (
    ("Count Vectors: ", xtrain_count, xvalid_count),
    ("WordLevel TF-IDF: ", xtrain_tfidf, xvalid_tfidf),
    ("CharLevel Vectors: ", xtrain_tfidf_ngram_chars, xvalid_tfidf_ngram_chars),
)

for heading, train_matrix, valid_matrix in xgb_feature_sets:
    # Both matrices are converted to CSC format before being handed to a fresh classifier.
    accuracy = train_model(xgboost.XGBClassifier(), train_matrix.tocsc(), y_train, valid_matrix.tocsc())
    print(heading, accuracy*100)

XGB Model

Count Vectors:  66.10505450941527
WordLevel TF-IDF:  67.29435084241824
CharLevel Vectors:  67.29435084241824
