In [3]:
import re
import nltk
import numpy as np
import pandas as pd
#import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, classification_report, precision_recall_curve
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV, RandomizedSearchCV


import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

In [3]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [4]:
df = pd.read_csv('Bangla-Abbusive.csv', encoding = "ISO-8859-1")

In [5]:
df.head()

Unnamed: 0,Bangla,English,Reason,Categry,Label
0,????? ???? ????? ????? ???? ??? ??? ???? ?????...,Islam cannot be a religion that has to be save...,Hate towards religion,Religious,Yes
1,????? ???????? ???????? ????????,Shamim Patwari Congratulations Bangladesh,Normal Sentence,Sports,No
2,"???????? ???????? ??????? ?????, ?????? ??? ??...","Congratulations to Bangladesh cricket team, we...",Normal Sentence,Sports,No
3,"?????? ??????????, ???????, ????????? ????? ??...","Please do not catch rickshaw pullers, day labo...",Normal Sentence,Other,No
4,??? ???? ????? ?????????? ????? ??????? ????,It was very painful to see the rickshaw pulle...,Normal Sentence,Other,No


In [6]:
df.shape

(299, 5)

In [7]:
df['Categry'].unique()

array(['Religious', 'Sports', 'Other', 'Personal', 'Geopolitical',
       'Political', 'Gender', 'Sports ', 'Reliigious', 'Others',
       'Personal '], dtype=object)

In [8]:
df.isnull().sum()

Bangla     0
English    1
Reason     1
Categry    0
Label      0
dtype: int64

In [9]:
df.dropna(inplace = True)

In [10]:
df.isnull().sum()

Bangla     0
English    0
Reason     0
Categry    0
Label      0
dtype: int64

In [11]:
df.shape

(297, 5)

In [12]:
df['Combine'] = df.English.str.cat(df['Reason']," ")

In [13]:
df['Combine'][10]

'May the personal security of our very popular, best young MP of the country not be disturbed. That too needs to be seen. Normal Sentence'

In [14]:
#Text Preprocessing
lemma = WordNetLemmatizer()
def textpreprocess(text):
    Comment = re.sub('[^a-zA-Z]', ' ', text)
    Comment = Comment.lower()
    Comment = Comment.split()
    Comment = [lemma.lemmatize(word) for word in Comment if word not in set(stopwords.words('english'))]
    Comment = ' '.join(Comment)
    
    return Comment

In [15]:
textpreprocess('£££@=@/// 100th 200 Did you taste the pasta that served yesterday')

'th taste pasta served yesterday'

In [16]:
df['English'] = df['English'].apply(textpreprocess)

In [17]:
df['English'][10]

'may personal security popular best young mp country disturbed need seen'

In [18]:
df['Label'].unique()

array(['Yes', 'No'], dtype=object)

In [19]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Label'] = le.fit_transform(df['Label'])

In [20]:
df['Label'].unique()

array([1, 0])

In [21]:
df.head()

Unnamed: 0,Bangla,English,Reason,Categry,Label,Combine
0,????? ???? ????? ????? ???? ??? ??? ???? ?????...,islam cannot religion saved killing people,Hate towards religion,Religious,1,Islam cannot be a religion that has to be save...
1,????? ???????? ???????? ????????,shamim patwari congratulation bangladesh,Normal Sentence,Sports,0,Shamim Patwari Congratulations Bangladesh Norm...
2,"???????? ???????? ??????? ?????, ?????? ??? ??...",congratulation bangladesh cricket team play ma...,Normal Sentence,Sports,0,"Congratulations to Bangladesh cricket team, we..."
3,"?????? ??????????, ???????, ????????? ????? ??...",please catch rickshaw puller day laborer worke...,Normal Sentence,Other,0,"Please do not catch rickshaw pullers, day labo..."
4,??? ???? ????? ?????????? ????? ??????? ????,painful see rickshaw puller uncle screaming,Normal Sentence,Other,0,It was very painful to see the rickshaw pulle...


In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
Vector = TfidfVectorizer()
X = Vector.fit_transform(df['English'])


In [23]:
#Vector.get_feature_names()[:50]

In [24]:

y = df['Label']

In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# Model Building

In [26]:
model = list()
Accuracy = list()
precision = list()
recall = list()
F1score = list()


In [27]:
def test_eval(clf_model, X_test, y_test, algo=None):
    # Test set prediction
    #y_prob=clf_model.predict_proba(X_test)
    y_pred=clf_model.predict(X_test)

    print('Confusion Matrix')
    print('='*60)
    print(confusion_matrix(y_test,y_pred),"\n")
    print('Classification Report')
    print('='*60)
    print(classification_report(y_test,y_pred),"\n")
    print('Accuracy Score')
    print('='*60)
    print(clf_model.score(X_test, y_test))
          
    model.append(algo)
    Accuracy.append(clf_model.score(X_test, y_test))
    precision.append(precision_score(y_test,y_pred))
    recall.append(recall_score(y_test,y_pred))
    F1score.append(f1_score(y_test,y_pred))
   
    


# Logistic Regression

In [28]:
from sklearn.linear_model import LogisticRegression
log_model=LogisticRegression()

params={'C':np.logspace( -10, 1, 15),'class_weight':[None,'balanced'],'penalty':['l1','l2']}

cv = StratifiedKFold(n_splits=5, random_state=100, shuffle=True)

# Create grid search using 5-fold cross validation
clf_LR = GridSearchCV(log_model, params, cv=cv, scoring='roc_auc', n_jobs=-1)
clf_LR.fit(X_train, y_train)
clf_LR.best_estimator_

        nan 0.9311192         nan 0.9311192         nan 0.9311192
        nan 0.93064073        nan 0.9311192         nan 0.93064073
        nan 0.9311192         nan 0.9311192         nan 0.9311192
        nan 0.9311192         nan 0.9311192         nan 0.9311192
        nan 0.9311192         nan 0.9311192         nan 0.9311192
        nan 0.9311192         nan 0.9311192         nan 0.93064073
        nan 0.93064073        nan 0.93109528        nan 0.93109528
        nan 0.93248908        nan 0.93248908        nan 0.93428334
        nan 0.93428334        nan 0.93385271        nan 0.93385271]


LogisticRegression(C=1.6378937069540613)

In [29]:
test_eval(clf_LR, X_test, y_test, 'Logistic Regression')

Confusion Matrix
[[31 20]
 [ 4 35]] 

Classification Report
              precision    recall  f1-score   support

           0       0.89      0.61      0.72        51
           1       0.64      0.90      0.74        39

    accuracy                           0.73        90
   macro avg       0.76      0.75      0.73        90
weighted avg       0.78      0.73      0.73        90
 

Accuracy Score
0.9069884364002011


In [30]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
lr.score(X_test, y_test)

0.6888888888888889

# 2.MultinomialNB

In [31]:
from sklearn.naive_bayes import MultinomialNB
clf_MB = MultinomialNB()
clf_MB.fit(X_train,y_train)
test_eval(clf_MB, X_test, y_test, 'MultinomialNB')

Confusion Matrix
[[32 19]
 [ 4 35]] 

Classification Report
              precision    recall  f1-score   support

           0       0.89      0.63      0.74        51
           1       0.65      0.90      0.75        39

    accuracy                           0.74        90
   macro avg       0.77      0.76      0.74        90
weighted avg       0.78      0.74      0.74        90
 

Accuracy Score
0.7444444444444445


# 3.Decision Tree

In [32]:
estimators = [2,10,30,50,100]
# Maximum number of depth in each tree:
max_depth = [i for i in range(5,16,2)]
# Minimum number of samples to consider to split a node:
min_samples_split = [2, 5, 10, 15, 20, 50, 100]
# Minimum number of samples to consider at each leaf node:
min_samples_leaf = [1, 2, 5]
#Impurity
criterion = ['gini', 'entropy']
#The number of features to consider when looking for the best split
max_features = ['log2', 'sqrt', 'auto']


In [33]:
from sklearn.tree import DecisionTreeClassifier
tree_model = DecisionTreeClassifier()
tree_param_grid = { 
    'max_features':max_features,
    'criterion':criterion,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf
}

clf_DT = RandomizedSearchCV(tree_model, tree_param_grid, cv=cv, scoring='roc_auc', n_jobs=-1, verbose=2)
clf_DT.fit(X_train, y_train)
clf_DT.best_estimator_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


DecisionTreeClassifier(max_depth=11, max_features='sqrt', min_samples_split=20)

In [34]:
test_eval(clf_DT, X_test, y_test, 'Decision Tree')

Confusion Matrix
[[ 3 48]
 [ 0 39]] 

Classification Report
              precision    recall  f1-score   support

           0       1.00      0.06      0.11        51
           1       0.45      1.00      0.62        39

    accuracy                           0.47        90
   macro avg       0.72      0.53      0.37        90
weighted avg       0.76      0.47      0.33        90
 

Accuracy Score
0.5460030165912518


In [35]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

DecisionTreeClassifier()

In [36]:
dt.score(X_test, y_test)

0.6444444444444445

# 4.Random forest CLassifier

In [37]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier()

rf_params={'n_estimators':estimators,
           'max_features':max_features,
           'criterion':criterion,
           'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf}

clf_RF = RandomizedSearchCV(rf_model, rf_params, cv=cv, scoring='roc_auc', n_jobs=-1, n_iter=20, verbose=2)
clf_RF.fit(X_train, y_train)
clf_RF.best_estimator_

Fitting 5 folds for each of 20 candidates, totalling 100 fits


RandomForestClassifier(max_depth=15)

In [38]:
test_eval(clf_RF, X_test, y_test, 'Random Forest')

Confusion Matrix
[[18 33]
 [ 2 37]] 

Classification Report
              precision    recall  f1-score   support

           0       0.90      0.35      0.51        51
           1       0.53      0.95      0.68        39

    accuracy                           0.61        90
   macro avg       0.71      0.65      0.59        90
weighted avg       0.74      0.61      0.58        90
 

Accuracy Score
0.8838612368024134


# 5.Support Vector Machine

In [39]:
from sklearn.svm import SVC
clf_SVC = SVC(kernel = 'sigmoid', gamma = 1.0)
clf_SVC.fit(X_train, y_train)
test_eval(clf_SVC, X_test, y_test, 'Support Vector Machine')

Confusion Matrix
[[35 16]
 [ 5 34]] 

Classification Report
              precision    recall  f1-score   support

           0       0.88      0.69      0.77        51
           1       0.68      0.87      0.76        39

    accuracy                           0.77        90
   macro avg       0.78      0.78      0.77        90
weighted avg       0.79      0.77      0.77        90
 

Accuracy Score
0.7666666666666667


# 6.AdaBoostClassifier

In [40]:
from sklearn.ensemble import AdaBoostClassifier
clf_ada=AdaBoostClassifier()
clf_ada.fit(X_train, y_train)
test_eval(clf_ada, X_test, y_test, 'AdaBoost')

Confusion Matrix
[[25 26]
 [ 3 36]] 

Classification Report
              precision    recall  f1-score   support

           0       0.89      0.49      0.63        51
           1       0.58      0.92      0.71        39

    accuracy                           0.68        90
   macro avg       0.74      0.71      0.67        90
weighted avg       0.76      0.68      0.67        90
 

Accuracy Score
0.6777777777777778


# 7.GradientBoostingClassifier

In [41]:
from sklearn.ensemble import GradientBoostingClassifier

gb_model = GradientBoostingClassifier()

gb_params = { 
    "n_estimators":[1,3,5,10,15,20,30,40,50,],
    'learning_rate': [0.1, 0.05, 0.02, 0.01],
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf
}

clf_gb=RandomizedSearchCV(gb_model,gb_params,cv=cv, scoring='roc_auc',n_jobs=1)

clf_gb.fit(X_train, y_train)
clf_gb.best_estimator_



GradientBoostingClassifier(learning_rate=0.05, max_depth=15, min_samples_leaf=2,
                           min_samples_split=20, n_estimators=50)

In [42]:
test_eval(clf_gb, X_test, y_test, 'GradientBoost')

Confusion Matrix
[[21 30]
 [ 2 37]] 

Classification Report
              precision    recall  f1-score   support

           0       0.91      0.41      0.57        51
           1       0.55      0.95      0.70        39

    accuracy                           0.64        90
   macro avg       0.73      0.68      0.63        90
weighted avg       0.76      0.64      0.62        90
 

Accuracy Score
0.8619909502262444


# 8.SGDClassifier

In [43]:
from sklearn.linear_model import SGDClassifier
sgd_params = {
    "loss" : [ "log", "squared_hinge", "modified_huber"],
    "alpha" : [0.0001, 0.001, 0.01, 0.1],
    "penalty" : ["l2", "l1", "none"],
}

sgd_model = SGDClassifier()
clf_sgd=RandomizedSearchCV(sgd_model,sgd_params,cv=cv, scoring='roc_auc',n_jobs=1)

clf_sgd.fit(X_train, y_train)
clf_sgd.best_estimator_


SGDClassifier(alpha=0.1, loss='modified_huber')

In [44]:
test_eval(clf_sgd, X_test, y_test, 'SGDClassifier')

Confusion Matrix
[[21 30]
 [ 2 37]] 

Classification Report
              precision    recall  f1-score   support

           0       0.91      0.41      0.57        51
           1       0.55      0.95      0.70        39

    accuracy                           0.64        90
   macro avg       0.73      0.68      0.63        90
weighted avg       0.76      0.64      0.62        90
 

Accuracy Score
0.8979386626445449


# 9.ExtraTreesClassifier

In [45]:
from sklearn.ensemble import ExtraTreesClassifier
clf_ETC = ExtraTreesClassifier()
clf_ETC.fit(X_train, y_train)
test_eval(clf_ETC, X_test, y_test, 'ExtraTreesClassifier')

Confusion Matrix
[[21 30]
 [ 2 37]] 

Classification Report
              precision    recall  f1-score   support

           0       0.91      0.41      0.57        51
           1       0.55      0.95      0.70        39

    accuracy                           0.64        90
   macro avg       0.73      0.68      0.63        90
weighted avg       0.76      0.64      0.62        90
 

Accuracy Score
0.6444444444444445


# 10.KNeighbours Classifier

In [46]:
from sklearn.neighbors import KNeighborsClassifier
clf_KNC = KNeighborsClassifier()
clf_KNC.fit(X_train, y_train)
test_eval(clf_KNC, X_test, y_test, 'KNeighbours')

Confusion Matrix
[[37 14]
 [ 8 31]] 

Classification Report
              precision    recall  f1-score   support

           0       0.82      0.73      0.77        51
           1       0.69      0.79      0.74        39

    accuracy                           0.76        90
   macro avg       0.76      0.76      0.75        90
weighted avg       0.76      0.76      0.76        90
 

Accuracy Score
0.7555555555555555


# 11.MLPClassifier

In [47]:
from sklearn.neural_network import MLPClassifier
clf_mlp  = MLPClassifier()
clf_mlp.fit(X_train, y_train)
test_eval(clf_mlp, X_test, y_test, 'MLPClassifier')

Confusion Matrix
[[35 16]
 [ 4 35]] 

Classification Report
              precision    recall  f1-score   support

           0       0.90      0.69      0.78        51
           1       0.69      0.90      0.78        39

    accuracy                           0.78        90
   macro avg       0.79      0.79      0.78        90
weighted avg       0.81      0.78      0.78        90
 

Accuracy Score
0.7777777777777778


In [48]:
Accuracy

[0.9069884364002011,
 0.7444444444444445,
 0.5460030165912518,
 0.8838612368024134,
 0.7666666666666667,
 0.6777777777777778,
 0.8619909502262444,
 0.8979386626445449,
 0.6444444444444445,
 0.7555555555555555,
 0.7777777777777778]

In [49]:
clf_eval_df = pd.DataFrame({'Model':model,
                            'Accuracy':Accuracy,
                            'Precision':precision,
                            'Recall':recall,
                            'F1-score':F1score,
                            })

In [50]:
clf_eval_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-score
0,Logistic Regression,0.906988,0.636364,0.897436,0.744681
1,MultinomialNB,0.744444,0.648148,0.897436,0.752688
2,Decision Tree,0.546003,0.448276,1.0,0.619048
3,Random Forest,0.883861,0.528571,0.948718,0.678899
4,Support Vector Machine,0.766667,0.68,0.871795,0.764045
5,AdaBoost,0.677778,0.580645,0.923077,0.712871
6,GradientBoost,0.861991,0.552239,0.948718,0.698113
7,SGDClassifier,0.897939,0.552239,0.948718,0.698113
8,ExtraTreesClassifier,0.644444,0.552239,0.948718,0.698113
9,KNeighbours,0.755556,0.688889,0.794872,0.738095


In [51]:
#clf_eval_df.to_csv('Bigram.csv')

In [53]:
import pickle
pickle.dump(clf_mlp, open('model-mlp.pkl', 'wb'))

In [54]:
pickle.dump(Vector,open('vectorizer.pkl', 'wb'))