<a href="https://colab.research.google.com/github/Paul-mwaura/Natural-Language-Processing/blob/main/Gender_Based_violence_NLP_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import NuSVC, SVC
import pickle

In [None]:
# Spreadsheet holding the labelled training examples, resolved relative to
# the notebook's working directory.
data_url = 'training data.xlsx'
# Read the sheet into the notebook-wide frame `df`, which later cells filter
# and split, then preview its first rows.
df = pd.read_excel(io=data_url)
df.head(n=5)

Unnamed: 0,labels,data
0,assault,"he punched me in the back of the fuckin’ head,..."
1,assault,"I’ve managed to gain my feet, um still trying ..."
2,assault,I started screaming ‘Help! Help!’ and the male...
3,assault,"He just dropped the jumpers onto the ground, t..."
4,assault,He tried to assault me and my attempt was to g...


In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(101, 2)

In [None]:
def prepareData(data_url, testSize, data=None, random_state=None):
    """Split the 'assault' / 'sexual abuse' examples into train and test sets.

    Parameters
    ----------
    data_url : str
        Not read by this function. It is kept only so existing calls keep
        working; the rows come from ``data`` or, when that is omitted, from
        the notebook-level ``df``.
    testSize : float or int
        Forwarded to ``train_test_split`` as ``test_size``.
    data : pandas.DataFrame, optional
        Frame with a 'labels' column and a 'data' column. When omitted, the
        notebook-level ``df`` loaded in an earlier cell is used.
    random_state : int, optional
        Forwarded to ``train_test_split`` so the split can be reproduced.
        The default (None) leaves the split unseeded.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, X, y)`` where ``X`` is the
        'data' column and ``y`` the 'labels' column of the filtered rows.

    Raises
    ------
    ValueError
        If no row carries one of the two labels.
    """
    source = df if data is None else data
    # Boolean-mask selection yields a new frame, so the source is left untouched.
    selected = source.loc[source['labels'].isin(['assault', 'sexual abuse'])]
    if selected.empty:
        raise ValueError(
            "no rows labelled 'assault' or 'sexual abuse' were found; nothing to split"
        )
    X = selected['data']
    y = selected['labels']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=testSize, random_state=random_state
    )
    return X_train, X_test, y_train, y_test, X, y

def makeModel(classifier, X_train, y_train):
    """Fit a bag-of-words -> TF-IDF -> classifier pipeline and return it.

    Parameters
    ----------
    classifier : estimator
        Placed as the final 'classifier' step of the pipeline.
    X_train : iterable of str
        Raw training texts passed to the pipeline's ``fit``.
    y_train : iterable
        Training labels passed to the pipeline's ``fit``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The pipeline after ``fit(X_train, y_train)`` has been called on it.
    """
    steps = [
        ('bow', CountVectorizer()),      # raw strings -> integer token counts
        ('tfidf', TfidfTransformer()),   # token counts -> weighted TF-IDF scores
        ('classifier', classifier),      # final estimator trained on the TF-IDF vectors
    ]
    # Pipeline.fit returns the pipeline itself, so the fitted object is handed back directly.
    return Pipeline(steps).fit(X_train, y_train)

def testPerformance(model, model_type, X_test, y_test):
    """Print a confusion matrix and classification report for ``model``.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict``; it is called once on ``X_test``.
    model_type : str
        Human-readable classifier name inserted into the printed heading.
    X_test : iterable
        Held-out inputs to predict on.
    y_test : iterable
        True labels matching ``X_test``.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    print("results with "+model_type+" classifier: \n")
    # Predict once and reuse the result, so both summaries describe the same
    # predictions and the model is not evaluated twice.
    predictions = model.predict(X_test)
    print("confusion matrix: \n", confusion_matrix(y_test, predictions))

    print("\n"+classification_report(y_test, predictions))


# Prepare training and testing data.
# Seed NumPy's global random state first: the split, the random forest and the
# cross-validation pipelines below are all created without an explicit
# random_state, so they draw from that global state. Seeding it makes a
# "Restart & Run All" print the same numbers every time.
SEED = 42
np.random.seed(SEED)

# prepareData works on the frame `df` loaded earlier; `data_url` (set in the
# loading cell) is passed only because the signature asks for it.
X_train, X_test, y_train, y_test, X, y = prepareData(data_url, testSize=0.2)


def evaluateClassifier(header, label, classifier_cls, X_train, y_train, X_test, y_test):
    """Report hold-out and 3-fold cross-validated accuracy for one classifier.

    Parameters
    ----------
    header : str
        Line printed before the results.
    label : str
        Classifier name passed to testPerformance for its heading.
    classifier_cls : callable
        Zero-argument factory (e.g. an estimator class). It is called twice so
        the hold-out model and the cross-validation pipeline each get a fresh
        estimator.
    X_train, y_train : training texts and labels.
    X_test, y_test : held-out texts and labels.

    Returns
    -------
    tuple
        ``(fitted_model, cv_pipeline, scores)``: the pipeline fitted on the
        training split by makeModel, the pipeline object handed to
        cross_val_score, and the per-fold accuracy scores.
    """
    print(header)
    fitted_model = makeModel(classifier_cls(), X_train, y_train)
    testPerformance(fitted_model, label, X_test, y_test)

    print("\ncross validation: ")
    cv_pipeline = Pipeline([
        ('bow', CountVectorizer()),          # strings to token integer counts
        ('tfidf', TfidfTransformer()),       # integer counts to weighted TF-IDF scores
        ('classifier', classifier_cls()),    # train on TF-IDF vectors with classifier
    ])
    scores = cross_val_score(cv_pipeline, X_train, y_train, cv=3, scoring='accuracy')
    print("mean accuracy:", np.mean(scores))
    # np.std is the spread of the fold scores, i.e. a standard deviation.
    print("standard deviation:", np.std(scores))
    return fitted_model, cv_pipeline, scores


#######################
# testing the different classifiers
########################

# Random forest
model, ranforest, r = evaluateClassifier(
    "Random Forest", "random forest", RandomForestClassifier,
    X_train, y_train, X_test, y_test)

# Nu-Support Vector Machine
model_1, NuSupport, s_nu = evaluateClassifier(
    "\nNu-Support Vector Machine", "Nu-Support Vector Machine", NuSVC,
    X_train, y_train, X_test, y_test)

# Support vector machine
model_2, Support, s = evaluateClassifier(
    "\nSupport Vector Machine", "Support Vector Machine", SVC,
    X_train, y_train, X_test, y_test)

# Multinomial Naive Bayes
model_3, Bayes, n = evaluateClassifier(
    "\nMultinomial Naive Bayes", "MultinomialNB", MultinomialNB,
    X_train, y_train, X_test, y_test)


Random Forest
results with random forest classifier: 

confusion matrix: 
 [[5 0]
 [1 6]]

              precision    recall  f1-score   support

     assault       0.83      1.00      0.91         5
sexual abuse       1.00      0.86      0.92         7

    accuracy                           0.92        12
   macro avg       0.92      0.93      0.92        12
weighted avg       0.93      0.92      0.92        12


cross validation: 
mean accuracy: 0.8291666666666666
standard error: 0.03280836614171587

New Support Vector Machine
results with Suport Vector Machine classifier: 

confusion matrix: 
 [[4 1]
 [0 7]]

              precision    recall  f1-score   support

     assault       1.00      0.80      0.89         5
sexual abuse       0.88      1.00      0.93         7

    accuracy                           0.92        12
   macro avg       0.94      0.90      0.91        12
weighted avg       0.93      0.92      0.91        12


cross validation: 
mean accuracy: 0.829166666666666

In [None]:
# Saving the classifiers
# Persist `model`, the random-forest pipeline that makeModel fitted on the
# training split. `ranforest` was only handed to cross_val_score, which fits
# clones of it, so that object is still unfitted and could not predict after
# being loaded back.
# NOTE: only unpickle these files from a trusted location; loading a pickle
# can execute arbitrary code.
with open("randomForest.p", "wb") as model_file:
    pickle.dump(model, model_file)
print("\nrandom forest saved")
# with open("NewSVC.p", "wb") as model_file:
#     pickle.dump(model_1, model_file)
# print("\nsupport vector saved")
# with open("MultiNB.p", "wb") as model_file:
#     pickle.dump(model_3, model_file)
# print("\nnaive bayes saved")


random forest saved
