## Data Exploration

In [1]:
import pandas as pd

In [2]:
# The three competition files share one Kaggle input directory and differ only in base name.
train_fname, test_fname, submission_fname = (
    f'/kaggle/input/quora-insincere-questions-classification/{base_name}.csv'
    for base_name in ('train', 'test', 'sample_submission')
)

In [3]:
# Read the training data, the test data and the sample submission, in that order.
df, test_df, submission_df = map(pd.read_csv, (train_fname, test_fname, submission_fname))

In [4]:
# Preview the training frame; the rendered table shows the qid, question_text and target columns.
df

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0
...,...,...,...
1306117,ffffcc4e2331aaf1e41e,What other technical skills do you need as a c...,0
1306118,ffffd431801e5a2f4861,Does MS in ECE have good job prospects in USA ...,0
1306119,ffffd48fb36b63db010c,Is foam insulation toxic?,0
1306120,ffffec519fa37cf60c78,How can one start a research project based on ...,0


## Text Preprocessing

### Tokenization

In [5]:
import nltk
from nltk.tokenize import word_tokenize

In [6]:
# Download the NLTK 'punkt' package.
# NOTE(review): presumably the tokenizer data word_tokenize needs — confirm for the installed NLTK version.
nltk.download('punkt')


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Removing Stop Words

In [7]:
from nltk.corpus import stopwords

In [8]:
# Download the NLTK 'stopwords' corpus that stopwords.words('english') reads in the next cell.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
# NLTK's English stop-word list; both tokenize() and the CountVectorizer filter against it.
english_stopwords = stopwords.words('english')

In [10]:
# Show the stop words as one comma-separated string for a quick visual check.
", ".join(english_stopwords)

"i, me, my, myself, we, our, ours, ourselves, you, you're, you've, you'll, you'd, your, yours, yourself, yourselves, he, him, his, himself, she, she's, her, hers, herself, it, it's, its, itself, they, them, their, theirs, themselves, what, which, who, whom, this, that, that'll, these, those, am, is, are, was, were, be, been, being, have, has, had, having, do, does, did, doing, a, an, the, and, but, if, or, because, as, until, while, of, at, by, for, with, about, against, between, into, through, during, before, after, above, below, to, from, up, down, in, out, on, off, over, under, again, further, then, once, here, there, when, where, why, how, all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own, same, so, than, too, very, s, t, can, will, just, don, don't, should, should've, now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't, doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn, mightn't, mustn, mus

### Stemming

In [11]:
from nltk.stem.snowball import SnowballStemmer

In [12]:
# Snowball stemmer configured for English; tokenize() applies it to every token it keeps.
stemmer = SnowballStemmer(language='english')

In [13]:
from functools import lru_cache

# Set membership is O(1); the NLTK stop-word list would be scanned linearly for every token.
_stopword_lookup = frozenset(english_stopwords)

# The stem depends only on the token itself, so words that recur across the corpus
# (the vast majority of tokens) are stemmed once and then served from the cache.
_cached_stem = lru_cache(maxsize=None)(stemmer.stem)


def tokenize(text):
    """Split text into word tokens, drop English stop words and stem the rest.

    Args:
        text: The string to tokenize.

    Returns:
        A list of stemmed tokens in their original order. Stop words are
        matched case-insensitively. Punctuation tokens produced by
        word_tokenize are not stop words and are therefore kept.
    """
    return [
        _cached_stem(word)
        for word in word_tokenize(text)
        if word.lower() not in _stopword_lookup
    ]

### Count Vectorizer

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
# Bag-of-words vectorizer that delegates tokenization to tokenize() and caps the
# vocabulary at 1000 features.
# NOTE(review): tokenize() already removes stop words before stemming, so stop_words
# here is applied to stemmed tokens — confirm that this second filter is intended.
# NOTE(review): scikit-learn may warn that the default token_pattern is unused when a
# custom tokenizer is given; passing token_pattern=None should silence it — confirm.
vectorizer = CountVectorizer(
    lowercase=True,
    tokenizer=tokenize,
    stop_words=english_stopwords,
    max_features=1000,
)

In [16]:
%%time
# Learn the vocabulary from the training questions only (the test set is never fitted on).
# NOTE(review): the later `vectorizer.transform(df['question_text'])` cell tokenizes the same
# corpus a second time (about 9 minutes each per the timings shown); a single
# `fit_transform` call would produce the vocabulary and the matrix in one pass.
vectorizer.fit(df['question_text'])



CPU times: user 9min 9s, sys: 682 ms, total: 9min 9s
Wall time: 9min 10s


In [17]:
# Inspect the first 100 entries of the learned vocabulary (capped at max_features=1000).
# NOTE(review): the output shows punctuation tokens ('!', '$', '(' ...) taking up feature
# slots — consider filtering them out in tokenize() if they are not useful signals.
vectorizer.get_feature_names_out()[:100]

array(['!', '$', '%', '&', "'", "''", "'m", "'s", '(', ')', ',', '-', '.',
       '1', '10', '100', '12', '12th', '15', '2', '20', '2017', '2018',
       '3', '4', '5', '6', '7', '8', ':', '?', '[', ']', '``', 'abl',
       'abroad', 'abus', 'accept', 'access', 'accomplish', 'accord',
       'account', 'achiev', 'acid', 'act', 'action', 'activ', 'actor',
       'actual', 'ad', 'add', 'address', 'admiss', 'adult', 'advanc',
       'advantag', 'advic', 'affect', 'africa', 'african', 'age', 'ago',
       'air', 'allow', 'almost', 'alon', 'alreadi', 'also', 'altern',
       'alway', 'amazon', 'america', 'american', 'among', 'amount',
       'analysi', 'android', 'anim', 'anoth', 'answer', 'anyon', 'anyth',
       'apart', 'app', 'appear', 'appl', 'appli', 'applic', 'approach',
       'arab', 'area', 'arm', 'armi', 'around', 'art', 'asian', 'ask',
       'associ', 'atheist', 'attack'], dtype=object)

In [18]:
%%time
# Turn every training question into a row of term counts over the fitted vocabulary.
inputs = vectorizer.transform(df['question_text'])

CPU times: user 9min 11s, sys: 316 ms, total: 9min 11s
Wall time: 9min 11s


In [19]:
%%time
# Encode the test questions with the vocabulary learned from the training data (no refit).
test_inputs = vectorizer.transform(test_df['question_text'])

CPU times: user 2min 39s, sys: 32 ms, total: 2min 39s
Wall time: 2min 39s


In [20]:
# Report the dimensions of both vectorized datasets.
for split_name, matrix in (("Train", inputs), ("Test", test_inputs)):
    print(f"{split_name} Input Shape: {matrix.shape}")

Train Input Shape: (1306122, 1000)
Test Input Shape: (375806, 1000)


## Machine Learning for Text Classification

### Create training and validation set

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
# Hold out 30% of the vectorized questions for validation; the fixed seed keeps the split repeatable.
# NOTE(review): the gap between accuracy and F1 reported further down suggests an imbalanced
# target — consider stratify=df['target'] so both splits keep the same class ratio.
train_inputs, val_inputs, train_targets, val_targets = train_test_split(
    inputs,
    df['target'],
    test_size=0.3,
    random_state=42,
)

In [23]:
# Report the dimensions of each piece of the train/validation split.
for label, split in (
    ("Train Input", train_inputs),
    ("Validation Input", val_inputs),
    ("Train Target", train_targets),
    ("Validation Target", val_targets),
):
    print(f"{label} Shape: {split.shape}")

Train Input Shape: (914285, 1000)
Validation Input Shape: (391837, 1000)
Train Target Shape: (914285,)
Validation Target Shape: (391837,)


### Logistic Regression Model

In [24]:
from sklearn.linear_model import LogisticRegression

In [25]:
# The 'sag' solver shuffles the data with a random number generator, so random_state is
# pinned (same seed as the train/validation split) to make the fitted coefficients and
# every metric derived from them reproducible across runs.
model = LogisticRegression(max_iter=1000, solver='sag', random_state=42)

In [26]:
# Fit on the training split only; the validation split stays held out for evaluation.
model.fit(train_inputs, train_targets)

In [27]:
# Class predictions on the training split, scored in the accuracy/F1 cell further down.
train_preds = model.predict(train_inputs)

In [28]:
# Class predictions on the held-out validation split.
val_preds = model.predict(val_inputs)

### Accuracy and F1 Score

In [29]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [30]:
# Score the training predictions with both metrics (accuracy first, then F1).
for metric_label, metric_fn in (("Accuracy", accuracy_score), ("F1 Score", f1_score)):
    print(f"Training {metric_label} {metric_fn(train_targets, train_preds)}")

Training Accuracy 0.945431676118497
Training F1 Score 0.38581329787888863


In [31]:
# Score the validation predictions with both metrics (accuracy first, then F1).
for metric_label, metric_fn in (("Accuracy", accuracy_score), ("F1 Score", f1_score)):
    print(f"Validation {metric_label} {metric_fn(val_targets, val_preds)}")

Validation Accuracy 0.9462429530646672
Validation F1 Score 0.383949461862424


### Making Predictions for Kaggle

In [32]:
# Preview the test frame; it has qid and question_text but no target column.
test_df

Unnamed: 0,qid,question_text
0,0000163e3ea7c7a74cd7,Why do so many women become so rude and arroga...
1,00002bd4fb5d505b9161,When should I apply for RV college of engineer...
2,00007756b4a147d2b0b3,What is it really like to be a nurse practitio...
3,000086e4b7e1c7146103,Who are entrepreneurs?
4,0000c4c3fbe8785a3090,Is education really making good people nowadays?
...,...,...
375801,ffff7fa746bd6d6197a9,How many countries listed in gold import in in...
375802,ffffa1be31c43046ab6b,Is there an alternative to dresses on formal p...
375803,ffffae173b6ca6bfa563,Where I can find best friendship quotes in Tel...
375804,ffffb1f7f1a008620287,What are the causes of refraction of light?


In [33]:
# Predict a class for every test question, using the count matrix built earlier as test_inputs.
test_preds = model.predict(test_inputs)

In [34]:
# Preview the sample submission template (qid and prediction columns).
submission_df

Unnamed: 0,qid,prediction
0,0000163e3ea7c7a74cd7,0
1,00002bd4fb5d505b9161,0
2,00007756b4a147d2b0b3,0
3,000086e4b7e1c7146103,0
4,0000c4c3fbe8785a3090,0
...,...,...
375801,ffff7fa746bd6d6197a9,0
375802,ffffa1be31c43046ab6b,0
375803,ffffae173b6ca6bfa563,0
375804,ffffb1f7f1a008620287,0


In [35]:
# test_preds was computed row-by-row from test_df['question_text'], so pair each prediction
# with the qid from that same frame instead of assuming the sample submission lists the
# questions in an identical order. Building a new frame also keeps this cell idempotent
# and leaves the template loaded from disk untouched.
submission_df = test_df[['qid']].assign(prediction=test_preds)

In [36]:
# Count how many test questions fall into each predicted class before writing the file.
submission_df['prediction'].value_counts()

0    365946
1      9860
Name: prediction, dtype: int64

In [37]:
# Write the submission file without the row index, leaving only the qid and prediction columns.
submission_df.to_csv('submission.csv', index=False)

### The End