In [None]:
# !unzip "/content/finalzip.zip"inmu
# !unzip "/content/finalzip.zip" -d "/"

### Importing Libraries and initializing stopwords and stemmer

In [None]:
import pandas as pd
import numpy as np
from glob import glob
import re
import json

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout


from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import stemmer as hindi_stemmer
import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
english_stopwords = stopwords.words("english")
with open('final_stopwords.txt', encoding = 'utf-8') as f:
    hindi_stopwords = f.readlines()
    for i in range(len(hindi_stopwords)):
        hindi_stopwords[i] = re.sub('\n','',hindi_stopwords[i])
stopwords = english_stopwords + hindi_stopwords
english_stemmer = SnowballStemmer("english")

## Reading Data

In [None]:
from glob import glob
train_directories = []
for i in glob("/content/finalzip/Hinglish/*/"):
    for j in glob(i+'*/'):
        train_directories.append(j)

train_directories

['/content/finalzip/Hinglish/islamophobia/1534043530368552960/',
 '/content/finalzip/Hinglish/islamophobia/1457992274353160192/',
 '/content/finalzip/Hinglish/islamophobia/1427127023047114756/',
 '/content/finalzip/Hinglish/islamophobia/1533742495426674688/',
 '/content/finalzip/Hinglish/islamophobia/1427132613525852161/',
 '/content/finalzip/Hinglish/islamophobia/1534698276540542976/',
 '/content/finalzip/Hinglish/islamophobia/1427164007366942722/',
 '/content/finalzip/Hinglish/islamophobia/1443896879461244936/',
 '/content/finalzip/Hinglish/islamophobia/1460517156630663169/',
 '/content/finalzip/Hinglish/islamophobia/1479780170265288706/',
 '/content/finalzip/Hinglish/islamophobia/1457070488669376518/',
 '/content/finalzip/Hinglish/islamophobia/1460523326384709637/',
 '/content/finalzip/Hinglish/islamophobia/1534125155236073474/',
 '/content/finalzip/Hinglish/islamophobia/1467510184142524421/',
 '/content/finalzip/Hinglish/islamophobia/1486934002816540685/',
 '/content/finalzip/Hingl

In [None]:
data = []
for i in train_directories:
    with open(i+'data.json', encoding='utf-8') as f:
        data.append(json.load(f))
labels = []
for i in train_directories:
    with open(i+'binary_labels.json', encoding='utf-8') as f:
        labels.append(json.load(f))

In [None]:

def tr_flatten(d,l):
    flat_text = []
    flat_text.append({
        'tweet_id':d['tweet_id'],
        'text':d['tweet'],
        'label':l[d['tweet_id']]
    })

    for i in d['comments']:
            flat_text.append({
                'tweet_id':i['tweet_id'],
                'text':flat_text[0]['text'] +' '+i['tweet'], #flattening comments(appending one after the other)
                'label':l[i['tweet_id']]
            })
            if 'replies' in i.keys():
                for j in i['replies']:
                    flat_text.append({
                        'tweet_id':j['tweet_id'],
                        'text':flat_text[0]['text'] +' '+ i['tweet'] +' '+ j['tweet'], #flattening replies
                        'label':l[j['tweet_id']]
                    })
    return flat_text

def te_flatten(d):
    flat_text = []
    flat_text.append({
        'tweet_id':d['tweet_id'],
        'text':d['tweet'],
    })

    for i in d['comments']:
            flat_text.append({
                'tweet_id':i['tweet_id'],
                'text':flat_text[0]['text'] + i['tweet'],
            })
            if 'replies' in i.keys():
                for j in i['replies']:
                    flat_text.append({
                        'tweet_id':j['tweet_id'],
                        'text':flat_text[0]['text'] + i['tweet'] + j['tweet'],
                    })
    return flat_text

In [None]:
data_label = []
#for train
for i in range(len(labels)):
    for j in tr_flatten(data[i], labels[i]):
        data_label.append(j)
train_len = len(data_label)

In [None]:
df = pd.DataFrame(data_label, columns = data_label[0].keys(), index = None)

In [None]:
df.replace({"HOF":1,"NOT":0 }, inplace=True)

In [None]:
df.head()

Unnamed: 0,tweet_id,text,label
0,1534043530368552960,I believe that what Nupur Sharma ji said is ag...,1
1,1534078734642294784,I believe that what Nupur Sharma ji said is ag...,1
2,1534080292377460736,I believe that what Nupur Sharma ji said is ag...,1
3,1534086557283635200,I believe that what Nupur Sharma ji said is ag...,0
4,1534090171611635713,I believe that what Nupur Sharma ji said is ag...,1


In [None]:
df['label'].value_counts()

1    2524
0    2390
Name: label, dtype: int64

In [None]:
tweets = df.text
y = df.label

## Preprocessing and featuring the raw text

<p>This is a preprocessing function and the regex will match with anything that is not English, Hindi and Emoji.</p>
<p>The preprocessing steps are as followed:</p>
<ul>
    <li>Remove Handles</li>
    <li>Remove URLs</li>    
    <li>Remove anything that is not English, Hindi and Emoji</li>    
    <li>Remove RT which appears in retweets</li>    
    <li>Remove Abundant Newlines</li>    
    <li>Remove Abundant whitespaces</li>    
    <li>Remove Stopwords</li>
    <li>Stem English text</li>
    <li>Stem Hindi text</li>
</ul>

In [None]:
regex_for_english_hindi_emojis="[^a-zA-Z#\U0001F300-\U0001F5FF'|'\U0001F600-\U0001F64F'|'\U0001F680-\U0001F6FF'|'\u2600-\u26FF\u2700-\u27BF\u0900-\u097F]"
def clean_tweet(tweet):
    tweet = re.sub(r"@[A-Za-z0-9]+",' ', tweet)
    tweet = re.sub(r"https?://[A-Za-z0-9./]+",' ', tweet)
    tweet = re.sub(regex_for_english_hindi_emojis,' ', tweet)
    tweet = re.sub("RT ", " ", tweet)
    tweet = re.sub("\n", " ", tweet)
    tweet = re.sub(r" +", " ", tweet)
    tokens = []
    for token in tweet.split():
        if token not in stopwords:
            token = english_stemmer.stem(token)
            token = hindi_stemmer.hi_stem(token)
            tokens.append(token)
    return " ".join(tokens)

In [None]:
cleaned_tweets = [clean_tweet(tweet) for tweet in tweets]

<p>Using TF-IDF for featuring the text. The vectorizer will only consider vocab terms that appear in more than 5 documents.</p>
<p>To learn more about TF-IDF you can check <a href = "https://towardsdatascience.com/tf-term-frequency-idf-inverse-document-frequency-from-scratch-in-python-6c2b61b78558">here</a> and <a href = "https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html">here</a>.</p>

In [None]:
vectorizer = TfidfVectorizer(min_df = 5)
X = vectorizer.fit_transform(cleaned_tweets)
X = X.todense()

## Training and evaluating model

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

<p>Training the Logistic Regression classifier provided by Scikit-Learn library.</p>
<p>To learn more about Logistic Regression classifier you can check <a href = "https://www.youtube.com/watch?v=yIYKR4sgzI8">here</a> and <a href = "https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html">here</a>.</p>

In [None]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

LogisticRegression()

<p>Predicting and priting classification metrics for validation set.</p>

In [None]:
y_pred = classifier.predict(X_val)

In [None]:
print(classification_report(y_val, y_pred))


              precision    recall  f1-score   support

           0       0.69      0.70      0.70       719
           1       0.71      0.70      0.71       756

    accuracy                           0.70      1475
   macro avg       0.70      0.70      0.70      1475
weighted avg       0.70      0.70      0.70      1475



In [None]:
# !pip install lazypredict

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [None]:
from lazypredict.Supervised import LazyClassifier
clf = LazyClassifier(verbose=0,predictions=True)
models,predictions = clf.fit(X_train, X_val, y_train, y_val)
models

100%|██████████| 29/29 [08:34<00:00, 17.74s/it]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
RandomForestClassifier,0.72,0.72,0.72,0.72,7.41
ExtraTreesClassifier,0.71,0.71,0.71,0.71,13.24
SVC,0.71,0.71,0.71,0.71,81.49
NuSVC,0.71,0.7,0.7,0.71,85.79
XGBClassifier,0.7,0.7,0.7,0.7,36.79
BaggingClassifier,0.7,0.7,0.7,0.7,19.3
NearestCentroid,0.69,0.69,0.69,0.69,0.67
LGBMClassifier,0.69,0.69,0.69,0.69,5.3
BernoulliNB,0.69,0.69,0.69,0.69,0.74
LogisticRegression,0.68,0.68,0.68,0.68,2.38


In [None]:
import plotly.express as px
fig = px.bar(models, x=models["F1 Score"], y=models.index,color=models["Accuracy"],
            title='Top Model Accuracy' ,width=800, height=900, template = 'plotly_white')
fig.update_layout(showlegend=False, font_size=18,yaxis={'categoryorder':'total ascending'})

In [None]:
from sklearn.metrics import confusion_matrix,classification_report

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import RidgeClassifierCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestCentroid
from lightgbm import LGBMClassifier
# ligthgbm = LGBMClassifier( random_state=2021)
# import xgboost
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import VotingClassifier

# lda = LinearDiscriminantAnalysis()
# rcCV = RidgeClassifierCV()
# rc = RidgeClassifier()
# ccCV = CalibratedClassifierCV()
# xgb = xgboost()
import xgboost
xgb = xgboost.XGBClassifier()
lr = LogisticRegression()
nc = NearestCentroid()
lgb = LGBMClassifier()

Classifier=VotingClassifier(estimators=[('xgb',xgb), ('lr',lr),('nc',nc),('lgb',lgb)],voting='hard')
Classifier.fit(X_train,y_train)
y_pred = Classifier.predict(X_val)

In [None]:
print('Voting Model accuracy score: {0:0.4f}'.format(accuracy_score(y_val, y_pred)))
print("f1 score:",f1_score(y_val,y_pred))
print(classification_report(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))

Voting Model accuracy score: 0.6956
f1 score: 0.675343456254519
              precision    recall  f1-score   support

           0       0.66      0.78      0.71       719
           1       0.74      0.62      0.68       756

    accuracy                           0.70      1475
   macro avg       0.70      0.70      0.69      1475
weighted avg       0.70      0.70      0.69      1475

[[559 160]
 [289 467]]


In [None]:
# BAGGING ENSEMBLE TECHNIQUE
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
Classifier = BaggingClassifier(LogisticRegression(), n_estimators=50, max_samples=100, bootstrap=True, n_jobs=-1)
Classifier.fit(X_train,y_train)
y_pred = Classifier.predict(X_val)

print('Bagging Ensemble Technique accuracy score: {0:0.4f}'.format(accuracy_score(y_val, y_pred)))
print("f1 score:",f1_score(y_val,y_pred))
print(classification_report(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))

Bagging Ensemble Technique accuracy score: 0.6617
f1 score: 0.701376421304608
              precision    recall  f1-score   support

           0       0.70      0.54      0.61       719
           1       0.64      0.78      0.70       756

    accuracy                           0.66      1475
   macro avg       0.67      0.66      0.66      1475
weighted avg       0.67      0.66      0.66      1475

[[390 329]
 [170 586]]


In [None]:
# RANDOM FOREST ENSEMBLE TECHNIQUE
from sklearn.ensemble import RandomForestClassifier
Classifier = RandomForestClassifier(n_estimators=50, max_leaf_nodes=13, n_jobs=-1)
Classifier.fit(X_train,y_train)
y_pred = Classifier.predict(X_val)

print('Random Forest accuracy score: {0:0.4f}'.format(accuracy_score(y_val, y_pred)))
print("f1 score:",f1_score(y_val,y_pred))
print(classification_report(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))

Random Forest accuracy score: 0.6658
f1 score: 0.623951182303585
              precision    recall  f1-score   support

           0       0.62      0.80      0.70       719
           1       0.74      0.54      0.62       756

    accuracy                           0.67      1475
   macro avg       0.68      0.67      0.66      1475
weighted avg       0.68      0.67      0.66      1475

[[573 146]
 [347 409]]


In [None]:
# ADABOOST ENSEMBLE TECHNIQUE
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
Classifier = AdaBoostClassifier(DecisionTreeClassifier(min_samples_split=10,max_depth=3),n_estimators=50,learning_rate=0.6)
Classifier.fit(X_train,y_train)
y_pred = Classifier.predict(X_val)

print('Adaboost accuracy score: {0:0.4f}'.format(accuracy_score(y_val, y_pred)))
print("f1 score:",f1_score(y_val,y_pred))
print(classification_report(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))

Adaboost accuracy score: 0.6637
f1 score: 0.6648648648648648
              precision    recall  f1-score   support

           0       0.65      0.68      0.66       719
           1       0.68      0.65      0.66       756

    accuracy                           0.66      1475
   macro avg       0.66      0.66      0.66      1475
weighted avg       0.66      0.66      0.66      1475

[[487 232]
 [264 492]]


In [None]:
# GRADIENT BOOSTING ENSEMBLE TECHNIQUE
from sklearn.ensemble import GradientBoostingClassifier
Classifier = GradientBoostingClassifier(max_depth=7, n_estimators=30, learning_rate=0.01)
Classifier.fit(X_train,y_train)
y_pred = Classifier.predict(X_val)

print('Gradient boosting accuracy score: {0:0.4f}'.format(accuracy_score(y_val, y_pred)))
print("f1 score:",f1_score(y_val,y_pred))
print(classification_report(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))

Gradient boosting accuracy score: 0.6359
f1 score: 0.5587510271158586
              precision    recall  f1-score   support

           0       0.59      0.83      0.69       719
           1       0.74      0.45      0.56       756

    accuracy                           0.64      1475
   macro avg       0.66      0.64      0.62      1475
weighted avg       0.67      0.64      0.62      1475

[[598 121]
 [416 340]]


In [None]:
# XGBOOST ENSEMBLE TECHNIQUE
from sklearn.ensemble import GradientBoostingClassifier
import xgboost
Classifier = xgboost.XGBClassifier()
Classifier.fit(X_train,y_train)
y_pred = Classifier.predict(X_val)
print('Extreme Gradient boosting accuracy score: {0:0.4f}'.format(accuracy_score(y_val, y_pred)))
print("f1 score:",f1_score(y_val,y_pred))
print(classification_report(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))

Extreme Gradient boosting accuracy score: 0.6997
f1 score: 0.7064280980781974
              precision    recall  f1-score   support

           0       0.69      0.69      0.69       719
           1       0.71      0.71      0.71       756

    accuracy                           0.70      1475
   macro avg       0.70      0.70      0.70      1475
weighted avg       0.70      0.70      0.70      1475

[[499 220]
 [223 533]]


**NEURAL NETWORK**

In [None]:
le = LabelEncoder() #label encoding labels for training Dense Neural Network
y_train = le.fit_transform(y_train)
y_val = le.transform(y_val)

In [None]:
model = Sequential(
    [
        Dense(64, activation="relu"),
        Dense(32, activation="relu"),
        Dense(1, activation="sigmoid"),
    ]
)
model.compile('adam', loss='binary_crossentropy', metrics = ['accuracy']) #compiling a neural network with 3 layers for classification

In [None]:
model.fit(X_train, y_train, epochs = 5, batch_size = 32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fedde3ae990>

In [None]:
y_pred = model.predict(X_val)
y_pred = (y_pred > 0.5).astype('int64')
y_pred = y_pred.reshape(len(y_pred))    

In [None]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.68      0.74      0.71       719
           1       0.73      0.67      0.70       756

    accuracy                           0.70      1475
   macro avg       0.70      0.70      0.70      1475
weighted avg       0.70      0.70      0.70      1475



In [None]:
from glob import glob
test_directories = []
for i in glob("/content/finalzip/Hinglish_test/*/"):
    for j in glob(i+'*/'):
        test_directories.append(j)

In [None]:
test_directories

['/content/finalzip/Hinglish_test/islamophobia/1442176510224261120/',
 '/content/finalzip/Hinglish_test/islamophobia/1533444368690032641/',
 '/content/finalzip/Hinglish_test/temple_mosque_controversies/1467712410534309889/',
 '/content/finalzip/Hinglish_test/temple_mosque_controversies/1467731786151170049/',
 '/content/finalzip/Hinglish_test/celebrity_controversies/1438882238087659525/',
 '/content/finalzip/Hinglish_test/celebrity_controversies/1425321569350414343/',
 '/content/finalzip/Hinglish_test/historical_hindu_muslim/1445435933214617602/',
 '/content/finalzip/Hinglish_test/ozil/1438762071835951104/',
 '/content/finalzip/Hinglish_test/hinduphobia/1467895004223791105/',
 '/content/finalzip/Hinglish_test/hinduphobia/1470652707824291843/',
 '/content/finalzip/Hinglish_test/hinduphobia/1445930336039358469/',
 '/content/finalzip/Hinglish_test/russia_ukarain_conflict/1497413697056215043/',
 '/content/finalzip/Hinglish_test/farmer_protest/1480518248076509184/']

In [None]:
test_data = []
for i in test_directories:
    with open(i+'data.json', encoding='utf-8') as f:
        data.append(json.load(f))

In [None]:
test_tweetid_data = []
#for test
for i in range(len(labels), len(data)):
    for j in te_flatten(data[i]):
        test_tweetid_data.append(j)

In [None]:
test_tweetid_data

[{'tweet_id': '1442176510224261120',
  'text': 'Darinder Moodi showing mass grave where he buried 200,000,0000,0000 Muslims in 2002 Gujarat Riots. https://t.co/VjASMBEe0K'},
 {'tweet_id': '1442176690235461647',
  'text': 'Darinder Moodi showing mass grave where he buried 200,000,0000,0000 Muslims in 2002 Gujarat Riots. https://t.co/VjASMBEe0K@TheZaiduLeaks 😂😂 मान जा !!'},
 {'tweet_id': '1442341201479233543',
  'text': 'Darinder Moodi showing mass grave where he buried 200,000,0000,0000 Muslims in 2002 Gujarat Riots. https://t.co/VjASMBEe0K@TheZaiduLeaks 😂😂 मान जा !!@KamalKesari1 @TheZaiduLeaks Sarcasm tha bhai iski purani tweets dekhle pta chal jayega kon hai phir follow krlio okk bhai'},
 {'tweet_id': '1442176841322618880',
  'text': 'Darinder Moodi showing mass grave where he buried 200,000,0000,0000 Muslims in 2002 Gujarat Riots. https://t.co/VjASMBEe0K@TheZaiduLeaks No this is new grievyard in making..😂😂😂😂'},
 {'tweet_id': '1442177683899568133',
  'text': "Darinder Moodi showing ma

In [None]:
test_df = pd.DataFrame(test_tweetid_data, columns = test_tweetid_data[0].keys(), index = None)

In [None]:
test_df.head()

Unnamed: 0,tweet_id,text
0,1442176510224261120,Darinder Moodi showing mass grave where he bur...
1,1442176690235461647,Darinder Moodi showing mass grave where he bur...
2,1442341201479233543,Darinder Moodi showing mass grave where he bur...
3,1442176841322618880,Darinder Moodi showing mass grave where he bur...
4,1442177683899568133,Darinder Moodi showing mass grave where he bur...


In [None]:
test_df.shape

(2988, 2)

In [None]:
test_tweets = test_df.text
tweet_ids = test_df.tweet_id

In [None]:
cleaned_test = [clean_tweet(tweet) for tweet in test_tweets]

In [None]:

X_test = vectorizer.transform(cleaned_test)
X_test = X_test.todense()

In [None]:
submission_prediction = Classifier.predict(X_test)
submission = {'id': tweet_ids, 'label':submission_prediction}
submission = pd.DataFrame(submission)

In [None]:
submission.shape

(2988, 2)

In [None]:
submission.head(20)

Unnamed: 0,id,label
0,1442176510224261120,0
1,1442176690235461647,0
2,1442341201479233543,1
3,1442176841322618880,0
4,1442177683899568133,0
5,1442198857278844934,0
6,1442183360713953284,0
7,1442177876720189440,1
8,1442206853589467137,1
9,1442269798704967681,1


In [None]:
submission.to_csv('submission.csv', index = False)