## Installing demoji

In [2]:
!pip install demoji

## Imports

In [None]:
import pandas as pd
import numpy as np

from sklearn.metrics import confusion_matrix
from sklearn import metrics

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,classification_report


from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score

## Loading data

In [3]:
# Read each split, discard rows with any missing field, and renumber from 0.
# NOTE(review): with `names=` given, read_excel still defaults to header=0
# while read_csv falls back to header=None, so the first row of the two
# files is treated differently -- confirm this matches the actual files.
df_train = (
    pd.read_excel('..........', names=["ID","Tweets","Labels"])
    .dropna()
    .reset_index(drop=True)
)

df_val = (
    pd.read_csv('..........', names=["ID","Tweets","Labels"])
    .dropna()
    .reset_index(drop=True)
)

In [5]:
# Single-column frames holding the text and the target of each split.
df_train_Tweet = df_train["Tweets"].to_frame()
df_train_Label = df_train["Labels"].to_frame()
df_val_Tweet = df_val["Tweets"].to_frame()
df_val_Label = df_val["Labels"].to_frame()


## Preprocessing

In [7]:
import re, string
# Character class matching any one character of string.punctuation;
# re.escape keeps characters such as ']' and '\' literal inside the class.
regex = re.compile('[%s]' % re.escape(string.punctuation))
import demoji

def preprocessing(document):
    """Normalise one raw tweet into plain, space-separated text.

    Steps, in order: emoji are swapped for their textual descriptions via
    ``demoji.replace_with_desc``; every ``string.punctuation`` character
    becomes a space; ASCII digits are deleted; stand-alone single ASCII
    letters are dropped; runs of whitespace collapse to one space.

    Parameters
    ----------
    document : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text. It is not stripped, so one leading and/or
        trailing space may remain.
    """
    # Emoji -> description text; ':' and '-' become spaces so words separate.
    document = demoji.replace_with_desc(document).replace(":", " ").replace("-", " ")
    document = regex.sub(' ', document)
    document = re.sub(r'[0-9]', '', document)
    # Remove every stand-alone single letter. Lookarounds do not consume the
    # neighbouring whitespace, so consecutive letters ("a b c") and letters at
    # the very start or end of the text are all caught.
    document = re.sub(r'(?<!\S)[a-zA-Z](?!\S)', ' ', document)
    # Collapse any run of whitespace into a single space.
    document = re.sub(r'\s+', ' ', document)
    return document

In [8]:
# Clean every tweet of both splits with the same preprocessing function.
corpus_train = df_train_Tweet["Tweets"].apply(preprocessing)
corpus_val = df_val_Tweet["Tweets"].apply(preprocessing)

## TFIDF Vectorization

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features with the vocabulary capped at 15000 terms.
# An earlier `TfidfVectorizer(ngram_range=(1,5))` assignment was overwritten
# on the very next line and therefore never used; it is dropped so the cell
# shows the configuration that actually runs. To use n-grams, pass
# ngram_range together with max_features in this single constructor call.
cv = TfidfVectorizer(max_features=15000)
# Fit on the training corpus only, then reuse the fitted vectorizer for validation.
X_train = cv.fit_transform(corpus_train).toarray()
X_test = cv.transform(corpus_val).toarray()
X_train.shape

(3999, 15000)

In [10]:
# Feature scaling: fit the scaler on the training matrix only, then apply the
# same fitted scaler to both the training and the validation matrices.
from sklearn.preprocessing import StandardScaler,MinMaxScaler

sc = MinMaxScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

## LOGISTIC REGRESSION

In [11]:
from sklearn.linear_model import LogisticRegression

# L2-penalised logistic regression trained on the scaled TF-IDF matrix.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases --
# confirm against the version this notebook targets.
classifier = LogisticRegression(
    penalty='l2',
    C=1,
    solver='lbfgs',
    multi_class='multinomial',
    max_iter=250,
    n_jobs=4,
    verbose=3,
)
# The label frame has a single column; flatten it to the 1-D array fit() expects.
classifier.fit(X_train, df_train_Label.values.ravel())

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 out of   1 | elapsed:   17.9s finished


LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=250,
                   multi_class='multinomial', n_jobs=4, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=3,
                   warm_start=False)

In [12]:
from sklearn.metrics import accuracy_score

# Predict on the validation features and compute accuracy against the labels.
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_true=df_val_Label, y_pred=y_pred)
accuracy

0.7024185068349106

In [13]:
# Per-class precision / recall / F1 for the logistic-regression predictions.
print(classification_report(y_true=df_val_Label, y_pred=y_pred))

              precision    recall  f1-score   support

         NOT       0.68      0.75      0.72       473
         OFF       0.73      0.65      0.69       478

    accuracy                           0.70       951
   macro avg       0.70      0.70      0.70       951
weighted avg       0.70      0.70      0.70       951



## RANDOM FOREST

In [14]:
# Random forest on the training set: 100 gini trees, depth capped at 100,
# with a fixed seed.
from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=100,
    random_state=0,
)
# The label frame has a single column; flatten it to the 1-D array fit() expects.
classifier.fit(X_train, df_train_Label.values.ravel())

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=100, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [15]:
from sklearn.metrics import accuracy_score

# Predict on the validation features and compute accuracy against the labels.
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_true=df_val_Label, y_pred=y_pred)
accuracy

0.6792849631966351

In [16]:
# Per-class precision / recall / F1 for the random-forest predictions.
print(classification_report(y_true=df_val_Label, y_pred=y_pred))

              precision    recall  f1-score   support

         NOT       0.63      0.85      0.73       473
         OFF       0.78      0.51      0.61       478

    accuracy                           0.68       951
   macro avg       0.70      0.68      0.67       951
weighted avg       0.71      0.68      0.67       951



## NAIVE BAYES

In [17]:
# Multinomial naive Bayes with alpha=1 smoothing and class priors fitted
# from the training labels.
from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB(
    alpha=1,
    fit_prior=True,
    class_prior=None,
)
# The label frame has a single column; flatten it to the 1-D array fit() expects.
classifier.fit(X_train, df_train_Label.values.ravel())

MultinomialNB(alpha=1, class_prior=None, fit_prior=True)

In [18]:
from sklearn.metrics import accuracy_score

# Predict on the validation features and compute accuracy against the labels.
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_true=df_val_Label, y_pred=y_pred)
accuracy

0.6971608832807571

In [19]:
# Per-class precision / recall / F1 for the naive-Bayes predictions.
print(classification_report(y_true=df_val_Label, y_pred=y_pred))

              precision    recall  f1-score   support

         NOT       0.68      0.73      0.71       473
         OFF       0.71      0.66      0.69       478

    accuracy                           0.70       951
   macro avg       0.70      0.70      0.70       951
weighted avg       0.70      0.70      0.70       951



## XGBOOST

In [20]:
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [21]:
# Gradient-boosted trees with the library's default hyper-parameters.
# NOTE(review): newer xgboost releases may reject string class labels and
# require integer-encoded targets -- confirm against the installed version.
classifier = XGBClassifier()
# The label frame has a single column; flatten it to the 1-D array fit() expects.
classifier.fit(X_train, df_train_Label.values.ravel())

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [22]:
from sklearn.metrics import accuracy_score

# Predict on the validation features and compute accuracy against the labels.
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_true=df_val_Label, y_pred=y_pred)
accuracy

0.6267087276550999

In [23]:
# Per-class precision / recall / F1 for the XGBoost predictions.
print(classification_report(y_true=df_val_Label, y_pred=y_pred))

              precision    recall  f1-score   support

         NOT       0.58      0.92      0.71       473
         OFF       0.81      0.33      0.47       478

    accuracy                           0.63       951
   macro avg       0.70      0.63      0.59       951
weighted avg       0.70      0.63      0.59       951



## SVM

In [24]:
from sklearn.svm import SVC

# Support-vector classifier with default settings apart from a fixed seed.
classifier = SVC(random_state=123)
# The label frame has a single column; flatten it to the 1-D array fit() expects.
classifier.fit(X_train, df_train_Label.values.ravel())

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=123, shrinking=True, tol=0.001,
    verbose=False)

In [25]:
from sklearn.metrics import accuracy_score

# Predict on the validation features and compute accuracy against the labels.
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_true=df_val_Label, y_pred=y_pred)
accuracy

0.6845425867507886

In [26]:
# Per-class precision / recall / F1 for the SVM predictions.
print(classification_report(y_true=df_val_Label, y_pred=y_pred))

              precision    recall  f1-score   support

         NOT       0.66      0.77      0.71       473
         OFF       0.72      0.60      0.66       478

    accuracy                           0.68       951
   macro avg       0.69      0.68      0.68       951
weighted avg       0.69      0.68      0.68       951

