In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
import pandas as pd
import numpy as np
from glob import glob
import re
import json
import sys

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

from tensorflow import keras
from keras import Sequential
from keras.layers import Dense, Dropout
from keras.utils import to_categorical

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
#import stemmer as hindi_stemmer

In [None]:
import nltk
nltk.download('stopwords')

In [None]:
english_stopwords = stopwords.words("english")

# The Hindi list is stored one stopword per line. Surrounding whitespace is
# stripped and blank lines are skipped so that padded or empty entries (which
# could never match a token) do not end up in the list. 'utf-8-sig' also
# tolerates a leading byte-order mark on the first entry.
with open('final_stopwords.txt', encoding='utf-8-sig') as f:
    hindi_stopwords = [line.strip() for line in f if line.strip()]

# Combined English + Hindi stopword list and the English stemmer used by clean_tweet.
stopword = english_stopwords + hindi_stopwords
english_stemmer = SnowballStemmer("english")

In [None]:
# Exception table consulted first by hi_stem: a word found here is mapped
# straight to the listed stem and the suffix-stripping rules are skipped.
words_dict  = { "तैराक":"तैर",
                "चालाक":"चाल",
                "कूलाक":"कूल",
                "बेलन":"बेल",
                "मिलाप":"मिल",
                "चुपचाप": "चुप",
                "निकास":"निकस",
                "लुकास":"लुक",
                }

In [None]:
# Column names used throughout the notebook for the three subtask labels.
RENAMED_COLUMNS = {'subtask_a': 'labels1', 'subtask_b': 'labels2',
                   'subtask_c': 'labels3', 'tweet': 'text'}

# `df` holds the training split. The test split gets its own name: previously
# the second read_csv was also assigned to `df`, silently discarding the
# training data that had just been loaded.
df = pd.read_csv("MOLD_train.csv").rename(columns=RENAMED_COLUMNS)
df_test = pd.read_csv("MOLD_test.csv").rename(columns=RENAMED_COLUMNS)

# Independent copy so later assignments into df1 do not act on a slice of df.
df1 = df[['text', 'labels1']].copy()
df1.head()

In [None]:
df2 = df[['text','labels2']].dropna()
df2.head()

In [None]:
df3 = df[['text','labels3']].dropna()
df3.head()

In [None]:
# Keep only rows that have tweet text, then renumber them. The filter and the
# reset are chained: previously each df_textN was immediately re-assigned from
# the unfiltered dfN, so the notna() filter never took effect.
# reset_index() keeps the old row labels in an 'index' column, as before.
df_text1 = df1[df1['text'].notna()].reset_index()
df_text2 = df2[df2['text'].notna()].reset_index()
df_text3 = df3[df3['text'].notna()].reset_index()
df_text3.tail(10)

In [None]:
# Relabel only the label column. Without the column selector the assignment
# overwrote every column of the matching rows, including the tweet text.
# NOTE(review): df_text1 was built from df1 in an earlier cell, so this relabel
# is not reflected in it — confirm whether the model should see it.
df1.loc[df1['labels1'] == 'NONE', 'labels1'] = 'not offensive'
df1['labels1'].value_counts()

In [None]:
# Relabel only the label column. Without the column selector the assignment
# overwrote every column of the matching rows, including the tweet text.
# NOTE(review): df_text2 was built from df2 in an earlier cell, so this relabel
# is not reflected in it — confirm whether the model should see it.
df2.loc[df2['labels2'] == 'NONE', 'labels2'] = 'TIN'
df2['labels2'].value_counts()


In [None]:
# Relabel only the label column. Without the column selector the assignment
# overwrote every column of the matching rows, including the tweet text.
# NOTE(review): df_text3 was built from df3 in an earlier cell, so this relabel
# is not reflected in it — confirm whether the model should see it.
df3.loc[df3['labels3'] == 'NONE', 'labels3'] = 'IND'
df3['labels3'].value_counts()

In [None]:
tweets = df_text1.text
y = df_text1.labels1

#tweets = df_text2.text
#y = df_text2.labels2

#tweets = df_text3.text
#y = df_text3.labels3

In [None]:
# Suffix table used by hi_stem. The key is the number of characters hi_stem
# removes from the end of a word that ends with one of the listed suffixes;
# hi_stem tries the keys from 5 down to 1.
# NOTE(review): hi_stem strips exactly `key` characters, so every entry should
# be `key` code points long — entries containing a nukta may be longer if
# stored in decomposed form; verify.
suffixes = {
	    1: ["ो", "े", "ू", "ु", "ी", "ि", "ा"],  
            2: ["तृ","ान","ैत","ने","ाऊ","ाव","कर", "ाओ", "िए", "ाई", "ाए", "नी", "ना", "ते", "ीं", "ती",
                "ता", "ाँ", "ां", "ों", "ें","ीय", "ति","या", "पन", "पा","ित","ीन","लु","यत","वट","लू"],     
            3: ["ेरा","त्व","नीय","ौनी","ौवल","ौती","ौता","ापा","वास","हास","काल","पान","न्त","ौना","सार","पोश","नाक",
                "ियल","ैया", "ौटी","ावा","ाहट","िया","हार", "ाकर", "ाइए", "ाईं", "ाया", "ेगी", "वान", "बीन",
                "ेगा", "ोगी", "ोगे", "ाने", "ाना", "ाते", "ाती", "ाता", "तीं", "ाओं", "ाएं", "ुओं", "ुएं", "ुआं","कला","िमा","कार",
                "गार", "दान","खोर"],     
            4: ["ावास","कलाप","हारा","तव्य","वैया", "वाला", "ाएगी", "ाएगा", "ाओगी", "ाओगे", 
                "एंगी", "ेंगी", "एंगे", "ेंगे", "ूंगी", "ूंगा", "ातीं", "नाओं", "नाएं", "ताओं", "ताएं", "ियाँ", "ियों", "ियां",
                "त्वा","तव्य","कल्प","िष्ठ","जादा","क्कड़"],     
            5: ["ाएंगी", "ाएंगे", "ाऊंगी", "ाऊंगा", "ाइयाँ", "ाइयों", "ाइयां", "अक्कड़","तव्य:","निष्ठ"],
}

# Endings that hi_stem rewrites after suffix stripping: the ending is removed
# and the replacement from dict_special_suffixes is appended in its place.
special_suffixes = ["र्", "ज्य","त्य"]
dict_special_suffixes = {"र्":"ृ",
                         "ज्य":"ज्",
                         "त्य":"त्"}

def hi_stem(word, clean=False, chars=None):
    """Return a rule-based stem for a Hindi word.

    Steps, in order: optional punctuation cleanup, lookup in the
    ``words_dict`` exception table, stripping of the longest matching entry
    from ``suffixes`` (buckets 5 down to 1), one recursive pass when the
    stripped stem still ends in a one-character suffix, and finally the
    ``special_suffixes`` rewrites.

    Args:
        word: Token to stem.
        clean: When true, ``clean_text(word, chars)`` is applied first.
        chars: Extra characters forwarded to ``clean_text``.

    Returns:
        The stemmed string, or the (optionally cleaned) word when no rule applies.
    """
    if clean:
        word = clean_text(word, chars)

    # Irregular words bypass the suffix rules entirely.
    if word in words_dict:
        return words_dict[word]

    stem = word
    stripped = False
    for length in (5, 4, 3, 2, 1):
        # Only strip when more than one character would remain.
        if len(word) > length + 1:
            for suf in suffixes[length]:
                if word.endswith(suf):
                    # Cut by the suffix's own length rather than the bucket
                    # key, so an entry whose code-point count differs from its
                    # bucket is still removed in full.
                    stem = word[:-len(suf)]
                    stripped = True
                    break
        if stripped:
            break

    # use case - गानेवाला: after the long suffix is removed the remainder can
    # still carry a one-character suffix, so it is stemmed once more. A single
    # recursive call is enough because the recursion repeats this check itself.
    if stripped and stem.endswith(tuple(suffixes[1])):
        stem = hi_stem(stem)

    for suf in special_suffixes:
        if stem.endswith(suf):
            stem = stem[:-len(suf)] + dict_special_suffixes[suf]

    return stem

def clean_text(text, chars=None):
    """Remove punctuation characters from ``text``.

    Args:
        text: String to clean.
        chars: Optional extra characters to remove in addition to the default
            punctuation set. Each character is taken literally.

    Returns:
        ``text`` with every listed character deleted.
    """
    # Escape the caller-supplied characters: spliced in raw, a leading '^'
    # negated the whole class and ']' or '\' corrupted the pattern.
    extra = "" if chars is None else re.escape(chars)
    return re.sub("[" + extra + r"()\"#/@;:<>{}`+=~|!?,']", "", text)

In [None]:
# Matches every character that is NOT an ASCII letter, '#', an emoji/symbol in
# the listed ranges, or Devanagari (U+0900-U+097F).
# NOTE(review): inside a character class the "'|'" separators are literal, so
# apostrophes and pipes are also kept — confirm that this is intended.
regex_for_english_hindi_emojis="[^a-zA-Z#\U0001F300-\U0001F5FF'|'\U0001F600-\U0001F64F'|'\U0001F680-\U0001F6FF'|'\u2600-\u26FF\u2700-\u27BF\u0900-\u097F]"
def clean_tweet(tweet, english_stemmer, stopword):
    """Normalise one tweet into a space-joined string of stemmed tokens.

    Args:
        tweet: Raw tweet text.
        english_stemmer: Object with a ``stem(token)`` method.
        stopword: Collection of tokens to drop.

    Returns:
        The cleaned tweet: mentions, URLs, the retweet marker and disallowed
        characters removed, stopwords dropped, and each remaining token passed
        through ``english_stemmer.stem`` and then ``hi_stem``.
    """
    # Handles may contain '_'; without it the tail of "@user_name" leaked through.
    tweet = re.sub(r"@[A-Za-z0-9_]+", ' ', tweet)
    # Cover query strings, dashes, etc. so URL fragments do not survive as tokens.
    tweet = re.sub(r"https?://[A-Za-z0-9._~:/?#@!$&()*+,;=%-]+", ' ', tweet)
    tweet = re.sub(regex_for_english_hindi_emojis, ' ', tweet)
    # Whole-word match: a plain "RT " also clipped words that merely end in RT.
    tweet = re.sub(r"\bRT\b", " ", tweet)

    # Set lookup instead of scanning the stopword list for every token.
    stopword_set = set(stopword)
    tokens = []
    # split() already collapses newlines and repeated spaces.
    for token in tweet.split():
        if token in stopword_set:
            continue
        tokens.append(hi_stem(english_stemmer.stem(token)))
    return " ".join(tokens)

In [None]:
cleaned_tweets = [clean_tweet(
        tweet, english_stemmer, stopword) for tweet in tweets]

In [None]:
# TF-IDF features over the cleaned tweets; terms seen in fewer than 5 tweets are dropped.
vectorizer = TfidfVectorizer(min_df = 5)
X = vectorizer.fit_transform(cleaned_tweets)
# toarray() gives a plain ndarray. todense() returns np.matrix, which current
# scikit-learn estimators refuse as input.
X = X.toarray()

# Subtask_a

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=44)

In [None]:
y_train.unique()

## Logistic regression

In [None]:
classifier =LogisticRegression()
classifier.fit(X_train,y_train)


In [None]:
y_pred=classifier.predict(X_val)

In [None]:
print(classification_report(y_val,y_pred))

## Classifier Comparison (Comparative Analysis)

In [None]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix,f1_score,plot_roc_curve,accuracy_score,roc_curve,roc_auc_score,recall_score,log_loss

In [None]:
names = [
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "QDA",
]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
]

In [None]:
 for name, clf in zip(names, classifiers):
     # Each estimator is wrapped in its own scaling pipeline. The pipeline gets
     # a separate name so the loop variable is not rebound, and the
     # human-readable name is printed so each report can be told apart.
     pipeline = make_pipeline(StandardScaler(), clf)
     pipeline.fit(X_train, y_train)
     y_pred = pipeline.predict(X_val)
     print(name)
     print(pipeline)
     print(classification_report(y_val, y_pred))

## Neural Network model

In [None]:
le = LabelEncoder()
y_train = le.fit_transform(y_train)
# Reuse the mapping learned on the training labels. Re-fitting on y_val can
# assign different integers when a class is missing from the validation split;
# transform() raises instead if y_val contains a label never seen in training.
y_val = le.transform(y_val)

In [None]:
model=Sequential(
    [
    Dense(64,activation="relu"),
    Dense(32,activation="relu"),
    Dense(16,activation="relu"),
    Dense(16,activation="relu"),
    Dense(8,activation="relu"),
    Dense(1,activation="sigmoid"),
    ]
)

model.compile('adam',loss='binary_crossentropy',metrics=['accuracy'])

![Screenshot 2023-02-20 224048.jpg](attachment:3c8a22bc-af01-4818-957e-3779832394c1.jpg)

In [None]:
model.fit(X_train, y_train, epochs = 1000, batch_size = 32)

In [None]:
y_pred = model.predict(X_val)
y_pred = (y_pred > 0.4).astype('int64')
y_pred = y_pred.reshape(len(y_pred))    

In [None]:
print(classification_report(y_val, y_pred))

In [None]:
from sklearn.metrics import confusion_matrix
# Rows are predicted labels and columns are actual labels (the transpose of
# scikit-learn's (y_true, y_pred) layout); this is the layout the heatmap cell
# that follows labels. Values are unchanged from before.
cm = confusion_matrix(y_val, y_pred).T
print('confusion_matrix\n\n',cm)
# Labels are ordered 0 then 1 and label 1 is the positive class, so the
# positives live at index 1 — the four entries were previously swapped.
print('\n True Positive(TP)= ',cm[1,1])
print('\n True Negative(TN)= ',cm[0,0])
print('\n False Positive (FP) = ' , cm[1,0])
print('\n False Negative (FN) = ',cm[0,1])

In [None]:
# Visualize the confusion matrix with a seaborn heatmap.

import seaborn as sns
# cm has predictions on the rows and actual labels on the columns, both ordered
# label 0 then label 1, so the "Negative :0" names must come first.
cm_matrix=pd.DataFrame(data=cm, columns=['Actual Negative :0', 'Actual Positive :1'],
                       index=['Predict Negative :0','Predict Positive :1'])

sns.heatmap(cm_matrix, annot =True ,fmt='d',cmap='YlGnBu')

# LightGBM Model

In [None]:
# built the lightgbm model
import lightgbm as lgb
clf =lgb.LGBMClassifier(boosting_type='dart',class_weight=None,colsample_bytree=1.0,
                        importance_type='split',learning_rate=0.1,max_depth=-1,
                        min_child_samples=10,min_child_weight=0.001,min_split_gain=0.0,
                        n_estimators=10,n_jobs=-1,num_leaves=31,objective='binary',
                        random_state=None,reg_alpha=0.0,reg_lambda=0.0,silent=True,
                        subsample=1.0,subsample_for_bin=2000,subsample_freq=0,metric='auc',
                        )
clf.fit(X_train,y_train)

In [None]:
from sklearn.metrics import accuracy_score
# Predict with the LightGBM model before scoring: y_pred otherwise still holds
# the neural network's predictions from the earlier cell.
y_pred = clf.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print('LightGBM Model Accuracy Score : {0:0.8f}'.format(accuracy))

In [None]:
y_pred=clf.predict(X_train)

In [None]:
print('Training-set accuracy score: {0:0.8f}'. format(accuracy_score(y_train, y_pred)))

In [None]:
y_pred=clf.predict(X_val)

In [None]:
# print the score on training and test set
print('training set score: {:.8}'.format(clf.score(X_train,y_train)))
print('Test set score: {:.8f}'.format(clf.score(X_val, y_val)))

In [None]:
from sklearn.metrics import confusion_matrix
# Rows are predicted labels and columns are actual labels (the transpose of
# scikit-learn's (y_true, y_pred) layout); this is the layout the heatmap cell
# that follows labels. Values are unchanged from before.
cm = confusion_matrix(y_val, y_pred).T
print('Confusion matrix\n\n', cm)
# Labels are ordered 0 then 1 and label 1 is the positive class, so the
# positives live at index 1 — the four entries were previously swapped.
print('\nTrue Positives(TP) = ', cm[1,1])
print('\nTrue Negatives(TN) = ', cm[0,0])
print('\nFalse Positives(FP) = ', cm[1,0])
print('\nFalse Negatives(FN) = ', cm[0,1])

In [None]:
# visualize confusion matrix with seaborn heatmap
import seaborn as sns
# cm has predictions on the rows and actual labels on the columns, both ordered
# label 0 then label 1, so the "Negative:0" names must come first.
cm_matrix = pd.DataFrame(data=cm, columns=['Actual Negative:0', 'Actual Positive:1'],
                                 index=['Predict Negative:0', 'Predict Positive:1'])

sns.heatmap(cm_matrix, annot=True, fmt='d', cmap='YlGnBu')

# subtask_b

In [None]:
# Subtask b uses the labels2 column (renamed from 'subtask_b'); this cell
# previously selected the subtask-c frame and labels.
tweets = df_text2.text
y = df_text2.labels2

# Rebuild the features from this subtask's tweets. X from subtask a was being
# reused, so its rows were not guaranteed to line up with y.
cleaned_tweets = [clean_tweet(tweet, english_stemmer, stopword) for tweet in tweets]
vectorizer = TfidfVectorizer(min_df=5)
X = vectorizer.fit_transform(cleaned_tweets).toarray()

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=50)

## Logistic Regression

In [None]:
classifier =LogisticRegression()
classifier.fit(X_train,y_train)


In [None]:
y_pred=classifier.predict(X_val)

In [None]:
print(classification_report(y_val,y_pred))

## Classifier Comparison (Comparative Analysis)

In [None]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix,f1_score,plot_roc_curve,accuracy_score,roc_curve,roc_auc_score,recall_score,log_loss

In [None]:
names = [
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "QDA",
]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
]

In [None]:
 for name, clf in zip(names, classifiers):
     # Each estimator is wrapped in its own scaling pipeline. The pipeline gets
     # a separate name so the loop variable is not rebound, and the
     # human-readable name is printed so each report can be told apart.
     pipeline = make_pipeline(StandardScaler(), clf)
     pipeline.fit(X_train, y_train)
     y_pred = pipeline.predict(X_val)
     print(name)
     print(pipeline)
     print(classification_report(y_val, y_pred))

## Neural Network Model

In [None]:
le = LabelEncoder()
y_train = le.fit_transform(y_train)
# Reuse the mapping learned on the training labels. Re-fitting on y_val can
# assign different integers when a class is missing from the validation split;
# transform() raises instead if y_val contains a label never seen in training.
y_val = le.transform(y_val)

In [None]:
model=Sequential(
    [
    Dense(64,activation="relu"),
    Dense(32,activation="relu"),
    Dense(16,activation="relu"),
    Dense(16,activation="relu"),
    Dense(8,activation="relu"),
    Dense(1,activation="sigmoid"),
    ]
)

model.compile('adam',loss='binary_crossentropy',metrics=['accuracy'])

In [None]:
model.fit(X_train, y_train, epochs = 1000, batch_size = 32)

In [None]:
y_pred = model.predict(X_val)
y_pred = (y_pred > 0.4).astype('int64')
y_pred = y_pred.reshape(len(y_pred))    

In [None]:
print(classification_report(y_val, y_pred))

In [None]:

from sklearn.metrics import confusion_matrix
# Rows are predicted labels and columns are actual labels (the transpose of
# scikit-learn's (y_true, y_pred) layout); this is the layout the heatmap cell
# that follows labels. Values are unchanged from before.
cm = confusion_matrix(y_val, y_pred).T
print('confusion_matrix\n\n',cm)
# Labels are ordered 0 then 1 and label 1 is the positive class, so the
# positives live at index 1 — the four entries were previously swapped.
print('\n True Positive(TP)= ',cm[1,1])
print('\n True Negative(TN)= ',cm[0,0])
print('\n False Positive (FP) = ' , cm[1,0])
print('\n False Negative (FN) = ',cm[0,1])

In [None]:
# visualize confusion matrix with seaborn heatmap
import seaborn as sns
# cm has predictions on the rows and actual labels on the columns, both ordered
# label 0 then label 1, so the "Negative:0" names must come first.
cm_matrix = pd.DataFrame(data=cm, columns=['Actual Negative:0', 'Actual Positive:1'],
                                 index=['Predict Negative:0', 'Predict Positive:1'])

sns.heatmap(cm_matrix, annot=True, fmt='d', cmap='YlGnBu')

# Lightgbm model

In [None]:
# built the lightgbm model
import lightgbm as lgb
clf =lgb.LGBMClassifier(boosting_type='dart',class_weight=None,colsample_bytree=1.0,
                        importance_type='split',learning_rate=0.1,max_depth=-1,
                        min_child_samples=10,min_child_weight=0.001,min_split_gain=0.0,
                        n_estimators=10,n_jobs=-1,num_leaves=31,objective='binary',
                        random_state=None,reg_alpha=0.0,reg_lambda=0.0,silent=True,
                        subsample=1.0,subsample_for_bin=2000,subsample_freq=0,metric='auc',
                        )
clf.fit(X_train,y_train)

In [None]:
from sklearn.metrics import accuracy_score
# Predict with the LightGBM model before scoring: y_pred otherwise still holds
# the neural network's predictions from the earlier cell.
y_pred = clf.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print('LightGBM Model Accuracy Score : {0:0.8f}'.format(accuracy))

In [None]:
y_pred=clf.predict(X_train)

In [None]:
print('Training-set accuracy score: {0:0.8f}'. format(accuracy_score(y_train, y_pred)))

In [None]:
y_pred=clf.predict(X_val)

In [None]:
# print the score on training and test set
print('training set score: {:.8}'.format(clf.score(X_train,y_train)))
print('Test set score: {:.8f}'.format(clf.score(X_val, y_val)))

In [None]:
from sklearn.metrics import confusion_matrix
# Rows are predicted labels and columns are actual labels (the transpose of
# scikit-learn's (y_true, y_pred) layout); this is the layout the heatmap cell
# that follows labels. Values are unchanged from before.
cm = confusion_matrix(y_val, y_pred).T
print('Confusion matrix\n\n', cm)
# Labels are ordered 0 then 1 and label 1 is the positive class, so the
# positives live at index 1 — the four entries were previously swapped.
print('\nTrue Positives(TP) = ', cm[1,1])
print('\nTrue Negatives(TN) = ', cm[0,0])
print('\nFalse Positives(FP) = ', cm[1,0])
print('\nFalse Negatives(FN) = ', cm[0,1])

In [None]:
# Visualize the confusion matrix with a seaborn heatmap.

import seaborn as sns
# cm has predictions on the rows and actual labels on the columns, both ordered
# label 0 then label 1, so the "Negative :0" names must come first.
cm_matrix=pd.DataFrame(data=cm, columns=['Actual Negative :0', 'Actual Positive :1'],
                       index=['Predict Negative :0','Predict Positive :1'])

sns.heatmap(cm_matrix, annot =True ,fmt='d',cmap='YlGnBu')

# Subtask_c

In [None]:
# Use the subtask-c frame, which has rows without text or labels3 already
# dropped; the raw `df` was selected here before, unfiltered.
tweets = df_text3.text
y = df_text3.labels3

# Rebuild the features from this subtask's tweets. X from subtask a was being
# reused, so its rows were not guaranteed to line up with y.
cleaned_tweets = [clean_tweet(tweet, english_stemmer, stopword) for tweet in tweets]
vectorizer = TfidfVectorizer(min_df=5)
X = vectorizer.fit_transform(cleaned_tweets).toarray()

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=50)

## Logistic Regression

In [None]:
classifier =LogisticRegression()
classifier.fit(X_train,y_train)


In [None]:
y_pred=classifier.predict(X_val)

In [None]:
print(classification_report(y_val,y_pred))

## Classifier Comparison (Comparative Analysis)

In [None]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix,f1_score,plot_roc_curve,accuracy_score,roc_curve,roc_auc_score,recall_score,log_loss

In [None]:
names = [
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "QDA",
]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
]


In [None]:
 for name, clf in zip(names, classifiers):
     # Each estimator is wrapped in its own scaling pipeline. The pipeline gets
     # a separate name so the loop variable is not rebound, and the
     # human-readable name is printed so each report can be told apart.
     pipeline = make_pipeline(StandardScaler(), clf)
     pipeline.fit(X_train, y_train)
     y_pred = pipeline.predict(X_val)
     print(name)
     print(pipeline)
     print(classification_report(y_val, y_pred))

In [None]:

le = LabelEncoder()
y_train = le.fit_transform(y_train)
# Reuse the mapping learned on the training labels. Re-fitting on y_val can
# assign different integers when a class is missing from the validation split;
# transform() raises instead if y_val contains a label never seen in training.
y_val = le.transform(y_val)

In [None]:
# Fully connected network: five ReLU hidden layers (64-32-16-16-8 units)
# followed by a single sigmoid output unit.
# NOTE(review): a single sigmoid unit with binary_crossentropy models a
# two-class target. Confirm that the encoded labels3 target has exactly two
# classes; otherwise this head, the loss and the thresholding cell that follows
# all need a multi-class formulation.
model=Sequential(
    [
    Dense(64,activation="relu"),
    Dense(32,activation="relu"),
    Dense(16,activation="relu"),
    Dense(16,activation="relu"),
    Dense(8,activation="relu"),
    Dense(1,activation="sigmoid"),
    ]
)

model.compile('adam',loss='binary_crossentropy',metrics=['accuracy'])

In [None]:
model.fit(X_train, y_train, epochs = 1000, batch_size = 32)

In [None]:
y_pred = model.predict(X_val)
y_pred = (y_pred > 0.4).astype('int64')
y_pred = y_pred.reshape(len(y_pred))    

In [None]:
print(classification_report(y_val, y_pred))

In [None]:

from sklearn.metrics import confusion_matrix
# Rows are predicted labels and columns are actual labels (the transpose of
# scikit-learn's (y_true, y_pred) layout). Values are unchanged from before.
cm = confusion_matrix(y_val, y_pred).T
print('confusion_matrix\n\n',cm)
# TP/TN/FP/FN only exist for a two-class matrix. Labels are ordered 0 then 1
# and label 1 is the positive class, so the positives live at index 1 — the
# four entries were previously swapped.
if cm.shape == (2, 2):
    print('\n True Positive(TP)= ',cm[1,1])
    print('\n True Negative(TN)= ',cm[0,0])
    print('\n False Positive (FP) = ' , cm[1,0])
    print('\n False Negative (FN) = ',cm[0,1])

In [None]:
# Visualize the confusion matrix with a seaborn heatmap.

import seaborn as sns
# cm has predictions on the rows and actual labels on the columns, ordered by
# encoded label. Axis names are generated from the matrix size so the frame
# can be built for any number of classes, not only two.
class_ids = range(cm.shape[0])
cm_matrix=pd.DataFrame(data=cm, columns=['Actual :{}'.format(i) for i in class_ids],
                       index=['Predict :{}'.format(i) for i in class_ids])

sns.heatmap(cm_matrix, annot =True ,fmt='d',cmap='YlGnBu')

## LightGBM model

In [None]:
# built the lightgbm model
import lightgbm as lgb
clf =lgb.LGBMClassifier(boosting_type='dart',class_weight=None,colsample_bytree=1.0,
                        importance_type='split',learning_rate=0.1,max_depth=-1,
                        min_child_samples=10,min_child_weight=0.001,min_split_gain=0.0,
                        n_estimators=10,n_jobs=-1,num_leaves=31,objective='binary',
                        random_state=None,reg_alpha=0.0,reg_lambda=0.0,silent=True,
                        subsample=1.0,subsample_for_bin=2000,subsample_freq=0,metric='auc',
                        )
clf.fit(X_train,y_train)

In [None]:
from sklearn.metrics import accuracy_score
# Predict with the LightGBM model before scoring: y_pred otherwise still holds
# the neural network's predictions from the earlier cell.
y_pred = clf.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print('LightGBM Model Accuracy Score : {0:0.8f}'.format(accuracy))

In [None]:
y_pred=clf.predict(X_train)

In [None]:
print('Training-set accuracy score: {0:0.8f}'. format(accuracy_score(y_train, y_pred)))

In [None]:
y_pred=clf.predict(X_val)

In [None]:
# print the score on training and test set
print('training set score: {:.8}'.format(clf.score(X_train,y_train)))
print('Test set score: {:.8f}'.format(clf.score(X_val, y_val)))

In [None]:
from sklearn.metrics import confusion_matrix
# Rows are predicted labels and columns are actual labels (the transpose of
# scikit-learn's (y_true, y_pred) layout). Values are unchanged from before.
cm = confusion_matrix(y_val, y_pred).T
print('Confusion matrix\n\n', cm)
# TP/TN/FP/FN only exist for a two-class matrix. Labels are ordered 0 then 1
# and label 1 is the positive class, so the positives live at index 1 — the
# four entries were previously swapped.
if cm.shape == (2, 2):
    print('\nTrue Positives(TP) = ', cm[1,1])
    print('\nTrue Negatives(TN) = ', cm[0,0])
    print('\nFalse Positives(FP) = ', cm[1,0])
    print('\nFalse Negatives(FN) = ', cm[0,1])

In [None]:
# Visualize the confusion matrix with a seaborn heatmap.

import seaborn as sns
# cm has predictions on the rows and actual labels on the columns, ordered by
# encoded label. Axis names are generated from the matrix size so the frame
# can be built for any number of classes, not only two.
class_ids = range(cm.shape[0])
cm_matrix=pd.DataFrame(data=cm, columns=['Actual :{}'.format(i) for i in class_ids],
                       index=['Predict :{}'.format(i) for i in class_ids])

sns.heatmap(cm_matrix, annot =True ,fmt='d',cmap='YlGnBu')