In [1]:
import nltk
import pandas as pd

nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer, RegexpStemmer
from nltk.stem import WordNetLemmatizer
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.classify import NaiveBayesClassifier
# from imblearn.over_sampling import SMOTE


from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

from sklearn import metrics

[nltk_data] Downloading package omw-1.4 to /home/ujjwal/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ujjwal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ujjwal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ujjwal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Reading dataset

In [3]:
df_train = pd.read_csv("dataset/train_df.csv")
df_test = pd.read_csv("dataset/test_df.csv")

In [4]:
df_train.shape

(1000000, 3)

Finding duplicates

In [5]:
len(df_train[df_train.duplicated()])
len(df_test[df_test.duplicated()])

0

In [4]:
troll_questions = df_train['target'][df_train['target'] == 1].index
genuine_questions = df_train['target'][df_train['target'] == 0].index

In [5]:
len(genuine_questions)/len(troll_questions)

15.162922256343947

## VECTORIZATION

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_validate, Y_train, Y_validate = train_test_split(df_train['question_text'], df_train['target'], test_size=0.1, train_size=0.9, shuffle=True)

X_test = df_test["question_text"]

In [7]:
X_validate.head()

43412     If you could tell one thing to your 10 year ol...
69323     How is the placement department at BMSCE Banga...
762053                             Who discovered eletrons?
608276    What do smart students do for timepass on inte...
876809    Why should you not bring too much money in you...
Name: question_text, dtype: object

In [10]:
Y_validate.head()

656554    0
609948    0
89026     0
176332    0
946346    0
Name: target, dtype: int64

In [11]:
X_train

978548                 How do I love a woman that loves me?
861936         What weight air hostess should have and age?
31594     Is there any meat that cannot be eaten or is d...
457591                       What is the AIIMS UG syllabus?
893069    Why is the price of Mishra Dhatu Nigam stock s...
                                ...                        
221513    Could Harrison Wells or Devoe from the Flash p...
316272    Why is it when I sign up for a class at a coll...
938623    Why aren't Muslim community ashamed of their h...
333430    Was Stephen Hawking battling depression toward...
147664         Who decided to kill John Fitzgerald Kennedy?
Name: question_text, Length: 900000, dtype: object

In [12]:
# # from sklearn.feature_extraction.text import CountVectorizer

# # We can change the number of words counted together and see results on accuracy
# # CountVectorizer(analyzer='word', ngram_range=(1, 3))

# # vectorizer = CountVectorizer()

# # X_train = vectorizer.fit_transform(question_text_train.values.astype('U'))
# Y_train = df_train['target']

In [13]:
# from sklearn.feature_extraction.text import TfidfVectorizer

# # # We can change the number of words counted together and see results on accuracy
# # # CountVectorizer(analyzer='word', ngram_range=(1, 3))

# tfidf = TfidfVectorizer(
#     strip_accents = 'unicode',
#     analyzer = 'word',
#     ngram_range = (1, 1),
#     max_features = 150000
# )
# tfidf.fit(df_train['question_text'])

# X_train1 = tfidf.transform(X_train)
# X_validate1 = tfidf.transform(X_validate)
# X_test1 = tfidf.transform(X_test)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

# # from sklearn.feature_extraction.text import TfidfVectorizer

# # We can change the number of words counted together and see results on accuracy
# CountVectorizer(analyzer='word', ngram_range=(1, 3))

word_vectorizer = TfidfVectorizer(
    strip_accents = 'unicode',
    analyzer = 'word',
    ngram_range = (1, 3),
    max_df = 0.5,
    max_features = 150000,
)

char_vectorizer = TfidfVectorizer(
    strip_accents = 'unicode',
    analyzer = 'char',
    ngram_range = (1, 3),
    max_df = 0.5,
    max_features = 150000,
)

# # tfidf = TfidfVectorizer()

X_train1_wv = word_vectorizer.fit_transform(X_train.values.astype('U'))
X_validate1_wv = word_vectorizer.transform(X_validate.values.astype('U'))

X_train1_cv = char_vectorizer.fit_transform(X_train.values.astype('U'))
X_validate1_cv = char_vectorizer.transform(X_validate.values.astype('U'))
# # X_train = tfidf.fit(X_train)

X_train1 = hstack((X_train1_wv, X_train1_cv)).tocsr()
X_validate1 = hstack((X_validate1_wv, X_validate1_cv)).tocsr()


X_test1_wv = word_vectorizer.transform(X_test)
X_test1_cv = char_vectorizer.transform(X_test)

X_test1 = hstack((X_test1_wv, X_test1_cv)).tocsr()


# BALANCING DATA

## Over-Sampling

In [15]:
# from collections import Counter
# from imblearn.over_sampling import RandomOverSampler

In [16]:
# ros = RandomOverSampler(random_state=42)
# X_train1, Y_train = ros.fit_resample(X_train1, Y_train)
# X_validate1, Y_validate = ros.fit_resample(X_validate1, Y_validate)

In [17]:
# sm = SMOTE(random_state=42)
# X_train1, Y_train = sm.fit_resample(X_train1, Y_train)
# X_validate1, Y_validate = sm.fit_resample(X_validate1, Y_validate)

In [18]:
# X_train1.shape

## Under-Sampling

In [19]:
# from imblearn.under_sampling import RandomUnderSampler

# ros = RandomUnderSampler(random_state=42)
# X_train1, Y_train = ros.fit_resample(X_train1, Y_train)
# X_validate1, Y_validate = ros.fit_resample(X_validate1, Y_validate)

In [20]:
# X_train1.shape

# GENERATING MODELS

## MODELS

## 2. Multinomial Naive Bayes

In [21]:
# multi_naive_bayes_model = MultinomialNB()

# multi_naive_bayes_model.fit(X_train1, Y_train)

# Y_pred_mnb = multi_naive_bayes_model.predict(X_validate1)
# pred_MNB = metrics.f1_score(Y_validate, Y_pred_mnb)

# print(f"Multinomial Naive Bayes: {pred_MNB}")

In [22]:
# Y_pred_mnb = multi_naive_bayes_model.predict(X_test1)

# # result_target = pd.DataFrame(Y_pred_mnb, columns=['target'])
# # question_id = df_test['qid']

# # result = pd.concat([question_id, result_target], axis=1, join='inner')

# # result.to_csv("seventh_submission_CV_multiNB.csv", index=False)

## 3. Logistic Regression

In [13]:
logistic_regression_model = LogisticRegression(
    dual = False,
    class_weight = {0: 0.23, 1: 0.77}
)
logistic_regression_model.fit(X_train1, Y_train)

Y_pred_lr = logistic_regression_model.predict(X_validate1)
pred_LR = metrics.f1_score(Y_validate, Y_pred_lr)

print(f"Logistic Regression  : {pred_LR}")

Logistic Regression  : 0.6391398491749422


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Changing Threshold of Logistic Regression

In [14]:
def custom_predict(X, threshold):
    probs = logistic_regression_model.predict_proba(X) 
    return (probs[:, 1] > threshold).astype(int)
    
Y_pred_lr = custom_predict(X=X_validate1, threshold=0.425)
pred_LR = metrics.f1_score(Y_validate, Y_pred_lr)

print(f"Logistic Regression  : {pred_LR}")

Logistic Regression  : 0.6336361134048996


### Confusion Matrix for different Class-Weights

In [15]:
from sklearn.metrics import confusion_matrix
confusion_matrix(Y_validate, Y_pred_lr)

array([[90072,  3766],
       [ 1558,  4604]])

In [16]:
import matplotlib.pyplot as plt
from sklearn.metrics import plot_confusion_matrix
 
# color = 'white'
# matrix = plot_confusion_matrix(logistic_regression_model, X_validate1, Y_validate, cmap=plt.cm.Blues)
# matrix.ax_.set_title('Confusion Matrix', color=color)
# plt.xlabel('Predicted Label', color=color)
# plt.ylabel('True Label', color=color)
# plt.gcf().axes[0].tick_params(colors=color)
# plt.gcf().axes[1].tick_params(colors=color)
# plt.show()

ImportError: cannot import name 'plot_confusion_matrix' from 'sklearn.metrics' (/home/ujjwal/.local/lib/python3.8/site-packages/sklearn/metrics/__init__.py)

In [25]:
Y_pred_lr = logistic_regression_model.predict(X_test1)

# Updating the classification based on new threshold
Y_pred_lr = custom_predict(X=X_test1, threshold=0.39)

result_target = pd.DataFrame(Y_pred_lr, columns=['target'])
question_id = df_test['qid']

result = pd.concat([question_id, result_target], axis=1, join='inner')

result.to_csv("50th_submission_tuning_threshold_LR.csv", index=False)

In [26]:
# print("f1-score using:\n")
# print(f"Multinomial Naive Bayes: {pred_MNB}")
# print(f"Logistic Regression    : {pred_LR}")

## SVM

In [27]:
# svc_model = LinearSVC(dual=False, class_weight={0: 0.23, 1: 0.77})
# svc_model.fit(X_train1, Y_train)

# Y_pred_svc = svc_model.predict(X_validate1)
# pred_SVC = metrics.f1_score(Y_validate, Y_pred_svc)

# print(f"Linear Support Vector: {pred_SVC}")

In [28]:
# Y_pred_svc = svc_model.predict(X_test1)

# result_target = pd.DataFrame(Y_pred_svc, columns=['target'])
# question_id = df_test['qid']

# result = pd.concat([question_id, result_target], axis=1, join='inner')

# result.to_csv("eleventh_submission_vectorizer_SVC.csv", index=False)

## VOTING

### Trying to vote results of models: Multinomial Naive Bayes, Logistic Regression, Linear SVM

In [29]:
# import numpy as np

# def changeToYesNo(x):
#   if x > 1:
#     return 1
#   else:
#     return 0 

# Y_sum = Y_pred_mnb + Y_pred_lr + Y_pred_svc

# mixed_pred = np.array([changeToYesNo(x) for x in Y_sum])

# # mixed_pred = metrics.f1_score(Y_validate, Y_sum)
# # print(f"Mixed Prediction: {mixed_pred}")

In [30]:
# result_target = pd.DataFrame(mixed_pred, columns=['target'])
# question_id = df_test['qid']

# result = pd.concat([question_id, result_target], axis=1, join='inner')

# result.to_csv("seventeen_submission_vectorizer_SVC.csv", index=False)

## BAGGING ENSEMBLE

In [31]:
# from sklearn.svm import SVC
# from sklearn.ensemble import BaggingClassifier

# clf = BaggingClassifier(base_estimator=SVC(),n_estimators=10, random_state=0)
# clf.fit(X_train1, Y_train)

# Y_pred_bagging = clf.predict(X_validate1)
# pred_bagging = metrics.f1_score(Y_validate, Y_pred_bagging)

In [32]:
# print(f"Bagging ensemble: {pred_bagging}")

## XG Boost

In [33]:
# from sklearn.ensemble import GradientBoostingClassifier

# boost_model = GradientBoostingClassifier()
# boost_model.fit(X_train1, Y_train)

# Y_pred_boost = boost_model.predict(X_validate1)
# pred_boost = metrics.f1_score(Y_validate, Y_pred_boost)

# print(pred_boost)

## RANDOM FOREST CLASSIFIER

In [34]:
# from sklearn.ensemble import RandomForestClassifier

In [35]:
# pred_rfc = 0.00  # in case the model is not run

# rfc_model = RandomForestClassifier(
#   class_weight= {0: 0.23, 1: 0.77}
# )

# rfc_model.fit(X_train1, Y_train)

# pred = rfc_model.predict(X_validate1)
# pred_rfc = metrics.f1_score(Y_validate, pred)

# print(f"Random Forest Classifier: {pred_rfc}")

# RESULTS

In [36]:
# print("f1-score using:\n")
# print(f"Multinomial Naive Bayes : {pred_MNB}")
# print(f"Logistic Regression     : {pred_LR}")
# print(f"Linear Support Vector   : {pred_SVC}")
# print(f"Random Forest Classifier: {pred_rfc}")
# print(f"XG Boost                : {pred_boost}")