# Importing important libraries

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib
from scipy import sparse
from scipy.sparse import hstack
from sklearn.ensemble import RandomForestClassifier 

# Reading input files and preprocessing

In [2]:
df1 =  pd.read_csv("train_df.csv")
df1.head()

Unnamed: 0,qid,question_text,target
0,dda0b0efc8ba86e81ec4,What are interesting facts about Microsoft his...,0
1,dc708b74a108d0fc0ad9,What are those things which are not gonna happ...,0
2,06a27ec5d82dacd8bfe0,"What should I know to avoid being ""upsold"" whe...",0
3,00cbb6b17e3ceb7c5358,How I add any account with payment bank?,0
4,7c304888973a701585a0,Which Multi level marketing products are actua...,0


In [3]:
df1.shape

(1000000, 3)

In [4]:
df1.isnull().sum()

qid              0
question_text    0
target           0
dtype: int64

In [5]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   qid            1000000 non-null  object
 1   question_text  1000000 non-null  object
 2   target         1000000 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 22.9+ MB


In [6]:
df1.duplicated().sum()

0

In [7]:
df2 =  pd.read_csv("test_df.csv")
df2.head()

Unnamed: 0,qid,question_text
0,a4f3da3a3df9dd881edd,My period is due on my wedding day. How can I ...
1,9914c62ed3f69684d549,How many numbers higher than a million can be ...
2,8138ae48649e37091a91,"How come I feel nothing for my family, but sti..."
3,981b4753d17ef14d09f7,"In case of collapse of the Democratic party, w..."
4,452e2c705276ba16b7b7,Who is Émile Naoumoff?


In [8]:
df2.shape

(306122, 2)

# Extracting statistical data from the input files

In [9]:
# import seaborn as sns

In [10]:
df1['target'].value_counts()

0    938130
1     61870
Name: target, dtype: int64

In [11]:
# Character length of every question, computed with the vectorised string accessor.
for frame in (df1, df2):
    frame['char_count'] = frame['question_text'].str.len()

In [12]:
# df1['word_count'] = df1['question_text'].apply(lambda x:len(nltk.word_tokenize(x)))

In [13]:
# df1['sentence_count'] = df1['question_text'].apply(lambda x:len(nltk.sent_tokenize(x)))

In [14]:
# df1[['char_count', 'word_count', 'sentence_count']].describe()

In [15]:
# df1[df1['target']==0][['char_count', 'word_count', 'sentence_count']].describe()

In [16]:
# df1[df1['target']==1][['char_count', 'word_count', 'sentence_count']].describe()

In [17]:
# plt.figure(figsize=(12, 4))
# sns.histplot(df1[df1['target']==0]['char_count'], color = 'green')
# sns.histplot(df1[df1['target']==1]['char_count'], color = 'red')

# Importing the Natural Language ToolKit for Exploratory Data Analysis(EDA)

In [18]:

import nltk
import nltk.corpus
from nltk.tokenize import word_tokenize,sent_tokenize

Converting all characters to lower case

In [19]:
def lower_case(question_text):
    """Return ``question_text`` with every character converted to lower case."""
    return question_text.lower()

In [20]:
lower_case("Hi how r you")

'hi how r you'

Tokenizing the sentences into words

In [21]:
def tokenize_word(question_text):
    """Split ``question_text`` into a list of tokens using ``nltk.word_tokenize``."""
    return nltk.word_tokenize(question_text)

In [22]:
tokenize_word(lower_case("Hi how are you?"))

['hi', 'how', 'are', 'you', '?']

Removing stopwords from the tokenized sentences (non-alphanumeric tokens are intentionally kept)

In [23]:
from nltk.corpus import stopwords
sw = stopwords.words('english')

In [24]:
def special_char(question_text):
    """Return the items of ``question_text`` that are not in the stopword list ``sw``.

    ``question_text`` is iterated item by item, so it is expected to be a
    sequence of tokens. The original order of the kept items is preserved.
    """
    # Non-alphanumeric tokens are deliberately NOT removed: people tend to make
    # typos, and that suggests the text is not spam.
    return [token for token in question_text if token not in sw]

In [25]:
# special_char('how123!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~')

Lemmatization (reducing words to their base form)

In [26]:
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
def lemma(text):
    """Lemmatize every word in ``text`` and return the results as a list.

    ``text`` is iterated word by word and each word is passed to the
    module-level ``lemmatizer``.
    """
    return list(map(lemmatizer.lemmatize, text))

[nltk_data] Downloading package omw-1.4 to /home/teja/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Preparing the train and test frames (merging them is currently disabled)

In [27]:
# Text-only view of the training data; `df1` itself keeps its `target` and
# `char_count` columns because `drop` returns a new frame.
df = df1.drop(columns=['target','char_count'])
# `df2` is rebound to the result, so without errors='ignore' a second run of this
# cell would raise KeyError (the column is already gone after the first run).
df2 = df2.drop(columns=['char_count'], errors='ignore')
# df = pd.concat([df,df2],ignore_index=True)

**Calling all functions**

In [28]:
# arr = []
# for text in df['question_text']:
#     arr.append(text)

In [29]:
# token = []
# for text in arr:
#     token.append(tokenize_word(text))
# token

In [30]:
# special = token
# for i in token:
#     special.append(special_char(i))
# special

In [31]:
# lem = []
# for i in special:
#     lem.append(lemma(i))
# lem

In [32]:
# final = arr
# for arr in special:
#     final.append(' '.join(arr))
# len(final)

In [33]:
# final[0]

In [34]:
from sklearn.model_selection import train_test_split

In [35]:
# X = pd.DataFrame(final[:1000000],columns=['data'])
# X = pd.concat([X,df1['target']],axis=1)
# Labelled data used for modelling: raw question text plus its target.
X = df1[['question_text','target']].copy()
# Hold out 10% for validation. The target is heavily imbalanced, so the split is
# stratified to keep the positive rate the same in both parts, and a fixed seed
# makes the split (and every score computed from it) reproducible across runs.
X_train, X_test = train_test_split(
    X,
    test_size=0.1,
    shuffle=True,
    random_state=42,
    stratify=X['target'],
)
# Unlabelled competition test questions.
Z = df2[['question_text']].copy()

In [36]:
X['question_text']

0         What are interesting facts about Microsoft his...
1         What are those things which are not gonna happ...
2         What should I know to avoid being "upsold" whe...
3                  How I add any account with payment bank?
4         Which Multi level marketing products are actua...
                                ...                        
999995                           How is CSE at VIT Chennai?
999996    How can we prevent a holocaust by robots, AI, ...
999997    How can I help a student remember key steps an...
999998    What is the difference between lace closure & ...
999999     What happens when you look into a broken mirror?
Name: question_text, Length: 1000000, dtype: object

In [37]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
# CountVec = CountVectorizer(lowercase=False,ngram_range=(1,2))

# Word-level TF-IDF over 1- to 3-grams, limited to 10000 features (max_df = 0.5).
word_vectorizer = TfidfVectorizer(strip_accents = 'unicode',analyzer = 'word',ngram_range = (1, 3),max_df = 0.5,max_features = 10000,)

# Character-level TF-IDF with the same settings; it is only constructed here and
# is not fitted in this cell.
char_vectorizer = TfidfVectorizer(strip_accents = 'unicode',analyzer = 'char',ngram_range = (1, 3),max_df = 0.5,max_features = 10000,)

# Validation features: vocabulary fitted on the 90% split only, then applied to the hold-out.
X_train_countW = word_vectorizer.fit_transform(X_train['question_text'])
X_test_countW = word_vectorizer.transform(X_test['question_text'])
# NOTE: the SAME vectorizer object is re-fitted below on the full labelled set, so
# statement order matters. After this line `word_vectorizer` no longer matches the
# columns of X_train_countW / X_test_countW.
trainW = word_vectorizer.fit_transform(X['question_text'])
testW = word_vectorizer.transform(Z['question_text'])


In [38]:
# Character n-gram features for the validation split (fitted on X_train only).
X_train_countC = char_vectorizer.fit_transform(X_train['question_text'])
X_test_countC = char_vectorizer.transform(X_test['question_text'])
# NOTE: `char_vectorizer` is re-fitted here on the full labelled set; from this point
# on it matches trainC / testC, not the split matrices above.
trainC = char_vectorizer.fit_transform(X['question_text'])
testC = char_vectorizer.transform(Z['question_text'])

Horizontally stacking the vectors 

In [39]:
# countcsr_train = sparse.csr_matrix(X['char_count']).transpose()
# countcsr_test = sparse.csr_matrix(Z['char_count']).transpose()

In [40]:

# Concatenate word-level and character-level features column-wise, as CSR matrices.
X_train_count = hstack([X_train_countW, X_train_countC], format="csr")
X_test_count = hstack([X_test_countW, X_test_countC], format="csr")
train = hstack([trainW, trainC], format="csr")
test = hstack([testW, testC], format="csr")

<font size=6>
Models
</font>

In [41]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,confusion_matrix,ConfusionMatrixDisplay,roc_auc_score, roc_curve, precision_recall_curve

In [42]:
class LogisticRegressionWithThreshold(LogisticRegression):
    """``LogisticRegression`` with an adjustable decision threshold.

    ``predict`` optionally takes a probability cut-off for the positive class
    (column 1 of ``predict_proba``). The ``threshold_from_*`` methods search a
    labelled data set for the cut-off that satisfies a given criterion.

    The confusion-matrix based searches assume binary labels encoded as 0/1.
    """

    def _positive_scores(self, X):
        """Return the predicted probability of the positive class for ``X``."""
        return LogisticRegression.predict_proba(self, X)[:, 1]

    @staticmethod
    def _confusion_counts(y, y_scores, thresholds):
        """Count tn/fp/fn/tp for every threshold in a single sorted pass.

        A sample is predicted positive when ``score >= threshold``. Sorting the
        scores once and looking each threshold up with ``searchsorted`` replaces
        one full confusion-matrix computation per threshold.

        Parameters
        ----------
        y : array-like of 0/1 labels (1 is the positive class).
        y_scores : array-like of positive-class scores, aligned with ``y``.
        thresholds : array-like of cut-offs to evaluate.

        Returns
        -------
        tuple of four integer arrays ``(tn, fp, fn, tp)``, one entry per threshold.
        """
        y_true = np.asarray(y)
        scores = np.asarray(y_scores)
        order = np.argsort(scores, kind="mergesort")
        sorted_scores = scores[order]
        # cum_pos[k] = number of positives among the k lowest-scoring samples.
        cum_pos = np.concatenate(([0], np.cumsum(y_true[order] == 1)))
        # Samples scoring strictly below the threshold are predicted negative.
        n_pred_neg = np.searchsorted(sorted_scores, thresholds, side="left")
        fn = cum_pos[n_pred_neg]
        tn = n_pred_neg - fn
        tp = cum_pos[-1] - fn
        fp = (scores.size - n_pred_neg) - tp
        return tn, fp, fn, tp

    def predict(self, X, threshold=None):
        """Predict labels for ``X``.

        With ``threshold=None`` this is the base-class ``predict``. Otherwise it
        returns an int array that is 1 where the positive-class probability is
        ``>= threshold`` and 0 elsewhere.
        """
        if threshold is None:
            return LogisticRegression.predict(self, X)
        return (self._positive_scores(X) >= threshold).astype(int)

    def threshold_from_optimal_tpr_minus_fpr(self, X, y):
        """Return ``(threshold, tpr - fpr)`` at the ROC point maximising ``tpr - fpr``."""
        y_scores = self._positive_scores(X)
        fpr, tpr, thresholds = roc_curve(y, y_scores)

        optimal_idx = np.argmax(tpr - fpr)

        return thresholds[optimal_idx], tpr[optimal_idx] - fpr[optimal_idx]

    def threshold_from_desired_precision(self, X, y, desired_precision=0.9):
        """Return ``(threshold, recall)`` for the first curve point reaching the precision.

        Raises
        ------
        ValueError
            If no threshold reaches ``desired_precision``.
        """
        y_scores = self._positive_scores(X)
        precisions, recalls, thresholds = precision_recall_curve(y, y_scores)

        # The curve has one more point than there are thresholds; that trailing
        # point has no threshold, so it is excluded from the search.
        reached = np.flatnonzero(precisions[:-1] >= desired_precision)
        if reached.size == 0:
            raise ValueError(
                f"no threshold reaches a precision of {desired_precision}"
            )
        desired_precision_idx = reached[0]

        return thresholds[desired_precision_idx], recalls[desired_precision_idx]

    def threshold_from_desired_recall(self, X, y, desired_recall=0.9):
        """Return ``(threshold, precision)`` for the last curve point still reaching the recall.

        Raises
        ------
        ValueError
            If no threshold reaches ``desired_recall``.
        """
        y_scores = self._positive_scores(X)
        precisions, recalls, thresholds = precision_recall_curve(y, y_scores)

        # Take the LAST point whose recall still meets the target. The first
        # point that falls below it would give a recall under the requested value.
        reached = np.flatnonzero(recalls[:-1] >= desired_recall)
        if reached.size == 0:
            raise ValueError(
                f"no threshold reaches a recall of {desired_recall}"
            )
        desired_recall_idx = reached[-1]

        return thresholds[desired_recall_idx], precisions[desired_recall_idx]

    @staticmethod
    def default_cost_function(tn, fp, fn, tp):
        """Example cost: correct predictions carry negative cost, errors positive cost.

        Declared as a static method so it can be passed on as
        ``LogisticRegressionWithThreshold.default_cost_function`` or through an
        instance.
        """
        cost = 0

        cost += (tn * -10000)
        cost += (fp * 1000)
        cost += (fn * 1500)
        cost += (tp * -20000)

        return cost

    def threshold_from_cost_function(self, X, y, cost_function):
        """Return ``(threshold, min_cost, df_cost)`` for the threshold minimising the cost.

        ``cost_function`` is called as ``cost_function(tn, fp, fn, tp)`` once per
        candidate threshold. ``df_cost`` holds precision, recall, threshold, cost
        and the four confusion counts for every candidate.
        """
        y_scores = self._positive_scores(X)
        precisions, recalls, thresholds = precision_recall_curve(y, y_scores)

        tns, fps, fns, tps = self._confusion_counts(y, y_scores, thresholds)
        costs = [
            cost_function(tn, fp, fn, tp)
            for tn, fp, fn, tp in zip(tns, fps, fns, tps)
        ]

        df_cost = pd.DataFrame({'precision':precisions[:-1], 'recall':recalls[:-1], 'threshold':thresholds, 'cost':costs, 'tn':tns, 'fp':fps, 'fn':fns, 'tp':tps})

        min_cost = df_cost['cost'].min()
        threshold = df_cost[df_cost['cost']==min_cost].iloc[0]['threshold']

        return threshold, min_cost, df_cost

    def threshold_from_optimal_f_score(self, X, y):
        """Return ``(threshold, f_score)`` for the threshold maximising the F1 score."""
        y_scores = self._positive_scores(X)
        precisions, recalls, thresholds = precision_recall_curve(y, y_scores)

        # Drop the trailing curve point so the arrays line up with `thresholds`.
        precisions, recalls = precisions[:-1], recalls[:-1]
        denominator = precisions + recalls
        # Where precision + recall is 0, F1 is set to 0 instead of NaN; a NaN
        # would otherwise be selected by argmax.
        fscores = np.divide(
            2 * precisions * recalls,
            denominator,
            out=np.zeros_like(denominator, dtype=float),
            where=denominator > 0,
        )

        optimal_idx = np.argmax(fscores)

        return thresholds[optimal_idx], fscores[optimal_idx]

    def threshold_from_optimal_accuracy(self, X, y):
        """Return ``(threshold, max_accuracy, df_accuracy)`` for the most accurate threshold.

        ``df_accuracy`` holds threshold, accuracy and the four confusion counts
        for every candidate threshold.
        """
        y_scores = self._positive_scores(X)
        precisions, recalls, thresholds = precision_recall_curve(y, y_scores)

        tns, fps, fns, tps = self._confusion_counts(y, y_scores, thresholds)
        accuracies = (tps + tns) / (tns + fps + fns + tps)

        df_accuracy = pd.DataFrame({'threshold':thresholds, 'accuracy':accuracies, 'tn':tns, 'fp':fps, 'fn':fns, 'tp':tps})

        max_accuracy = df_accuracy['accuracy'].max()
        threshold = df_accuracy[df_accuracy['accuracy']==max_accuracy].iloc[0]['threshold']

        return threshold, max_accuracy, df_accuracy

In [43]:
# Candidate classifiers. GaussianNB is only referenced from commented-out cells.
mnb = MultinomialNB()
gnb = GaussianNB()
# Class 1 is given a larger weight than class 0.
lr = LogisticRegression(
    dual=False,
    class_weight={0: 0.23, 1: 0.77},
)

<font size=6>
Testing with split train data
</font>

In [44]:
# Fit Multinomial Naive Bayes on the training split and score it on the hold-out split.
mnb.fit(X_train_count,X_train['target'])
split_res = mnb.predict(X_test_count)
# F1 of the hold-out predictions (displayed as the cell output).
f1_score(X_test['target'],split_res)

### Code for custom threshold (Takes too long)

In [None]:
# lrt = LogisticRegressionWithThreshold()
# lrt.fit(X_train_count,X_train['target'])
# threshold, max_accuracy, df_accuracy = lrt.threshold_from_optimal_accuracy(X_train_count, X_train['target'])
# y_pred = lrt.predict(X_train_count, threshold)

# threshold, max_accuracy

### Default logistic regression

In [None]:
# Fit the class-weighted logistic regression on the training split.
lr.fit(X_train_count,X_train['target'])
# NOTE: this overwrites the `split_res` produced by the Naive Bayes cell.
split_res = lr.predict(X_test_count)
# Confusion matrix of the hold-out predictions, with rows/columns ordered as lr.classes_.
cm = confusion_matrix(X_test['target'], split_res, labels=lr.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=lr.classes_)
disp.plot()
plt.show()

In [None]:
# rfc = RandomForestClassifier(max_depth=10,random_state=0,bootstrap=True,max_features=150000)

In [None]:
# rfc.fit(X_train_count,X_train['target'])
# split_res = rfc.predict(X_test_count)
# f1_score(X_test['target'],split_res)

<font size=6>
Obtaining predictions for given test data
</font>

In [None]:
# Refit both models on the full labelled feature matrix `train`; this replaces
# the fits made on the training split in the cells above.
lr.fit(train,df1['target'])
mnb.fit(train,df1['target'])

In [None]:
# Logistic-regression labels for the competition test matrix, as a one-column frame.
res = lr.predict(test)
finaldf = pd.DataFrame({'target': res})
finaldf

In [None]:
# Pair each test question id with its logistic-regression prediction (positional pairing via zip).
finaldf = pd.DataFrame(list(zip(df2['qid'],finaldf['target'])),columns =['qid', 'target'])
finaldf

In [None]:
# Multinomial Naive Bayes labels for the competition test matrix, as a one-column frame.
res2 = mnb.predict(test)
# len(res)
finaldf2 = pd.DataFrame(res2,columns=['target'])

In [None]:
# Pair each test question id with its Naive Bayes prediction (positional pairing via zip).
finaldf2 = pd.DataFrame(list(zip(df2['qid'],finaldf2['target'])),columns =['qid', 'target'])

In [None]:
# Write the submissions without the index column:
# 'fourthlr.csv' holds the logistic-regression labels, 'fourth.csv' the Naive Bayes labels.
finaldf.to_csv('fourthlr.csv',index=False)
finaldf2.to_csv('fourth.csv',index=False)

In [None]:
# gnb.fit(X_train_count,X_train['target'])
# split_res = gnb.predict(X_test_count)
# f1_score(X_test['target'],split_res)
# Needs dense data, given sparse matrix

In [None]:
# #Stacking
# esti = [('gnb', gnb), ('mnb',mnb),('lr', lr)]
# final_est = RandomForestClassifier()

In [None]:
# from sklearn.ensemble import StackingClassifier
# clf = StackingClassifier(estimators=estim, final_estimator=final_est)

In [None]:
# clf.fit(X_train,y_train)
# y_pred = clf.predict(X_test)
# print("Accuracy",accuracy_score(y_test,y_pred))
# print("Precision",precision_score(y_test,y_pred))

In [None]:
# #Bagging Bootstrapping
# dtree = DecisionTreeClassifier(random_state = 22)
# dtree.fit(X_train,y_train)

# y_pred = dtree.predict(X_test)