In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import re
import string

In [2]:
# Training CSV; the path points into /content/drive (presumably a Google Drive
# mount inside Colab — confirm before running elsewhere).
path = "/content/drive/MyDrive/DeepLearning/train.csv"
df = pd.read_csv(path)

In [3]:
# Preview the loaded frame; a bare last expression is rendered as the cell output.
df

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0
...,...,...,...
1306117,ffffcc4e2331aaf1e41e,What other technical skills do you need as a c...,0
1306118,ffffd431801e5a2f4861,Does MS in ECE have good job prospects in USA ...,0
1306119,ffffd48fb36b63db010c,Is foam insulation toxic?,0
1306120,ffffec519fa37cf60c78,How can one start a research project based on ...,0


In [4]:
# Count missing values per column.
df.isna().sum()

Unnamed: 0,0
qid,0
question_text,0
target,0


In [5]:
# Print how often each distinct value occurs in the text and label columns.
for col in ('question_text', 'target'):
    print(f"{col} , {df[col].value_counts()}")

question_text , question_text
How did Quebec nationalists see their province as a nation in the 1960s?                                    1
What is the best method to teach the pronunciation of English words to kids?                                1
How much mark should we get to pass in NEET exam?                                                           1
I’m 13 and overweight. Aside from having a healthy diet, what exercises can I do at home to lose weight?    1
How do you get perfect teeth without using braces?                                                          1
                                                                                                           ..
Why are all psychotherapists so bad at their jobs?                                                          1
How many babies are born each day in India?                                                                 1
What are the milk substitutes for baking a cake?                                          

In [6]:
# Show the dtype pandas assigned to each column.
df.dtypes

Unnamed: 0,0
qid,object
question_text,object
target,int64


In [7]:
# Display the frame again before dropping the identifier column.
df

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0
...,...,...,...
1306117,ffffcc4e2331aaf1e41e,What other technical skills do you need as a c...,0
1306118,ffffd431801e5a2f4861,Does MS in ECE have good job prospects in USA ...,0
1306119,ffffd48fb36b63db010c,Is foam insulation toxic?,0
1306120,ffffec519fa37cf60c78,How can one start a research project based on ...,0


In [8]:
# Copy of the data without the `qid` identifier column; `df` itself is left untouched.
df1 = df.drop(columns=['qid'])

In [9]:
# Preview the frame without `qid`. Only the last expression of a cell is
# rendered, so the former leading `df1.dtypes` line was evaluated and discarded.
df1

Unnamed: 0,question_text,target
0,How did Quebec nationalists see their province...,0
1,"Do you have an adopted dog, how would you enco...",0
2,Why does velocity affect time? Does velocity a...,0
3,How did Otto von Guericke used the Magdeburg h...,0
4,Can I convert montra helicon D to a mountain b...,0
...,...,...
1306117,What other technical skills do you need as a c...,0
1306118,Does MS in ECE have good job prospects in USA ...,0
1306119,Is foam insulation toxic?,0
1306120,How can one start a research project based on ...,0


In [10]:
# Deletion table covering every character in string.punctuation. A translate
# table is used instead of a regex character class because interpolating
# string.punctuation unescaped into "[...]" makes the backslash act as an
# escape for the following "]", so literal backslashes were never stripped.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def preprocess_text(text):
    """Lowercase ``text`` and remove all ASCII punctuation characters.

    Args:
        text: The raw string to normalise.

    Returns:
        The lowercased string with every character of ``string.punctuation``
        (including the backslash) removed. Whitespace and non-ASCII
        characters are left as they are.
    """
    return text.lower().translate(_PUNCTUATION_TABLE)

In [11]:
# Add a normalised copy of every question as a new `cleaned_text` column.
df['cleaned_text'] = df['question_text'].map(preprocess_text)

In [12]:
# Build TF-IDF features from the cleaned questions, dropping scikit-learn's
# built-in English stop-word list.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split below, so the learned vocabulary and IDF weights also reflect rows
# that are later used for evaluation — consider fitting on the training
# portion only.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['cleaned_text'])

In [13]:
# Labels for the classifier: the `target` column (int64 per the dtypes output).
Y = df['target']

In [14]:
# Dump the TF-IDF matrix; it prints as (row, column) weight entries.
print(X)

  (0, 68813)	0.20353340501475814
  (0, 181194)	0.4550531436181745
  (0, 151837)	0.4605798602266281
  (0, 178656)	0.44884840033718737
  (0, 151812)	0.3563042804801597
  (0, 4421)	0.4592922327954161
  (1, 17735)	0.47042153569112444
  (1, 71787)	0.3586445257593875
  (1, 78679)	0.4655727896244624
  (1, 168089)	0.21559530664995996
  (1, 17732)	0.4553121598366003
  (1, 201019)	0.42370881666253335
  (2, 71733)	0.28406853343162114
  (2, 233399)	0.6456303528611592
  (2, 18344)	0.5090591128704168
  (2, 221574)	0.1925185247087092
  (2, 206884)	0.26393742771389567
  (2, 94609)	0.3695930022033987
  (3, 68813)	0.15977614254237288
  (3, 162615)	0.392700998070055
  (3, 235743)	0.3676994407063329
  (3, 99428)	0.4879766636381452
  (3, 231454)	0.19986797909705797
  (3, 135577)	0.4632809314174789
  (3, 103862)	0.4385851991968126
  :	:
  (1306118, 71733)	0.17714402139360316
  (1306118, 97016)	0.2158697471497727
  (1306118, 120471)	0.31697470040374887
  (1306118, 112942)	0.2250318848033231
  (1306118, 13127

In [15]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

In [16]:
# Dump the training feature matrix as (row, column) weight entries.
print(X_train)

  (0, 166217)	0.4021979826373128
  (0, 212319)	0.5530708070695816
  (0, 130245)	0.7296228238822285
  (1, 185227)	0.6197402290632166
  (1, 170707)	0.5373719383773476
  (1, 104671)	0.5719732933671327
  (2, 194895)	0.5060669121895878
  (2, 184254)	0.5753506316516922
  (2, 40915)	0.6425479990202249
  (3, 136361)	0.24008957024775704
  (3, 99196)	0.29270514012167337
  (3, 164390)	0.27380871008954677
  (3, 137642)	0.3493837065243214
  (3, 57136)	0.3327553523605286
  (3, 120256)	0.5163473912899779
  (3, 197951)	0.531318889258117
  (4, 198877)	0.19028641651368586
  (4, 101709)	0.18523791341171858
  (4, 115619)	0.18398909467662924
  (4, 18978)	0.20240306642956074
  (4, 198851)	0.18029618072948317
  (4, 133053)	0.21204997286027583
  (4, 208933)	0.23036672213700027
  (4, 164964)	0.36644409251007526
  (4, 64660)	0.2523020074324886
  :	:
  (914282, 132139)	0.2738834139089024
  (914282, 171998)	0.3097478121869151
  (914282, 148335)	0.3312793063521553
  (914282, 46327)	0.43502519829251096
  (914282, 3

In [17]:
# Print the training labels; the index shows the original row positions.
print(y_train)

6750       0
1264797    0
867014     0
1068814    0
161774     0
          ..
110268     0
259178     0
131932     0
671155     0
121958     0
Name: target, Length: 914285, dtype: int64


In [18]:
# Fit a logistic-regression classifier, with every hyperparameter left at its
# default, on the training split.
model = LogisticRegression()
model.fit(X_train, y_train)

In [19]:
# Predict labels for the held-out test split.
y_pred = model.predict(X_test)

In [20]:
# Report overall accuracy, then per-class precision/recall/F1 on the test split.
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Classification Report:\n {classification_report(y_test, y_pred)}")

Accuracy: 0.9506937833844175
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.99      0.97    367973
           1       0.68      0.36      0.47     23864

    accuracy                           0.95    391837
   macro avg       0.82      0.67      0.72    391837
weighted avg       0.94      0.95      0.94    391837



In [21]:
import joblib

In [22]:
# The classifier only accepts TF-IDF vectors produced by the fitted
# vectorizer, so persist the vectorizer next to it; with the model file alone
# raw question text cannot be scored after reloading.
joblib.dump(vectorizer, 'Spam_filter_for_Quora questions_vectorizer.joblib')
# Keep the model dump as the last expression so the cell output is unchanged.
joblib.dump(model, 'Spam_filter_for_Quora questions.joblib')


['Spam_filter_for_Quora questions.joblib']

In [23]:
from google.colab import files

In [24]:
# Send the saved model file to the browser via the google.colab `files` helper.
files.download('Spam_filter_for_Quora questions.joblib')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>