# 4. Quora Insincere Questions Classification
### SVM

### 4.1 Import necessary libraries

In [1]:
import string
import os
import math

import pandas as pd
import numpy as np
import nltk

from nltk.util import ngrams
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

In [6]:
# Parameters and definitions
RANDOM_SEED = 0  # seed handed to np.random.seed so runs are repeatable
VAL_SET_SIZE = 0.2  # used as test_size in train_test_split, i.e. the share of training rows held out for validation

In [7]:
# Load essential NLTK resources, downloading the stop-word corpus if it is missing
try:
    stop_words = stopwords.words('english')
except LookupError:
    # stopwords.words raises LookupError when the corpus is not installed
    # locally; fetch it once and retry so the cell works on a fresh machine.
    nltk.download('stopwords')
    stop_words = stopwords.words('english')
stemmer = SnowballStemmer('english')

In [8]:
# Seed NumPy's global random number generator so repeated runs give the same results
np.random.seed(RANDOM_SEED)

### File Paths

In [9]:
# Input files are all resolved relative to DATA_DIR; the model is written
# next to the notebook.
DATA_DIR = "../input/"
TRAIN_SAMPLES = os.path.join(DATA_DIR, "train.csv")
TEST_SAMPLES = os.path.join(DATA_DIR, "test.csv")
EMBD_SAMPLES = os.path.join(DATA_DIR, "embeddings.zip")
MODEL_OUT = "model-svm.pkl"

###  4.2 Vectorization

In [17]:
def build_TF(dt_train, dt_test):
    """Build TF-IDF features and split the training data into train/validation parts.

    The vectorizer is fitted on the training questions only and then applied
    to the test questions, so both matrices share the same vocabulary.

    Args:
        dt_train: Table (e.g. a pandas DataFrame) with a 'question_text'
            column holding the raw text and a 'target' column holding labels.
        dt_test: Table with a 'question_text' column.

    Returns:
        A two-element list ``[split, X_test]`` where ``split`` is the
        ``[X_train, X_val, y_train, y_val]`` list produced by
        ``train_test_split`` and ``X_test`` is the TF-IDF matrix of the test
        questions.
    """
    max_features = 50000  # vocabulary cap; a larger vocabulary would also let in noisy n-grams
    # NOTE(review): ngram_range starts at 2, so single words are not used as
    # features -- confirm that excluding unigrams is intended.
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(2, 4), max_df=0.90, min_df=5,
                                       max_features=max_features)
    X = tfidf_vectorizer.fit_transform(dt_train['question_text'])
    X_test = tfidf_vectorizer.transform(dt_test['question_text'])
    y = dt_train["target"]
    # Seed the split explicitly so it does not depend on how much of NumPy's
    # global random state earlier cells have consumed.
    split = train_test_split(X, y, test_size=VAL_SET_SIZE, random_state=RANDOM_SEED)
    return [split, X_test]

In [18]:
# Load the raw data in this cell: no other cell defines df_train / df_test,
# so without this the notebook fails on a fresh kernel (Restart & Run All).
df_train = pd.read_csv(TRAIN_SAMPLES)
df_test = pd.read_csv(TEST_SAMPLES)

# Vectorize the questions and carve a validation set out of the training data.
tfvect = build_TF(df_train, df_test)
X_train, X_val, y_train, y_val = tfvect[0]
X_test = tfvect[1]

### 4.3 Machine Learning

In [19]:
def build_model(dt_train, dt_test, X=None, y=None, C=0.5):
    """Fit a linear SVM classifier and return it.

    Args:
        dt_train: Not used for fitting; kept so existing
            ``build_model(df_train, df_test)`` calls keep working.
        dt_test: Not used for fitting; kept for the same reason.
        X: Feature matrix to fit on. When omitted, the notebook-level
            ``X_train`` is used (the original behaviour).
        y: Labels matching ``X``. When omitted, the notebook-level
            ``y_train`` is used (the original behaviour).
        C: Regularization parameter forwarded to ``LinearSVC``.

    Returns:
        The fitted ``LinearSVC`` instance.
    """
    # Fall back to the globals the original implementation relied on, so
    # callers that pass only the two positional arguments are unaffected.
    if X is None:
        X = X_train
    if y is None:
        y = y_train
    # Seed the solver explicitly so the fitted model is reproducible.
    return LinearSVC(C=C, random_state=RANDOM_SEED).fit(X, y)

In [20]:
# Build the model: fit the linear SVM and keep the fitted estimator for the prediction cells
svm_model = build_model(df_train, df_test)

In [21]:
# Produce predictions for the training, validation and test matrices, in that order
y_pred_train, y_pred_val, y_pred_test = [
    svm_model.predict(features) for features in (X_train, X_val, X_test)
]

### 4.4 Evaluation

In [22]:
def produce_metrics(y, y_pred):
    """Print accuracy plus macro-averaged F1, precision and recall.

    Args:
        y: True labels.
        y_pred: Predicted labels, aligned with ``y``.
    """
    accuracy = accuracy_score(y, y_pred)
    # Same scorer order as the printed line: F1, precision, recall.
    macro_scores = [scorer(y, y_pred, average="macro")
                    for scorer in (f1_score, precision_score, recall_score)]
    template = "Accuracy: {}, F1 Score: {}, Precision: {}, Recall: {}"
    print(template.format(accuracy, *macro_scores))

In [23]:
def produce_classification_report(y, y_pred):
    """Print scikit-learn's classification report for the given predictions.

    Args:
        y: True labels.
        y_pred: Predicted labels, aligned with ``y``.
    """
    report_text = classification_report(y, y_pred)
    print(report_text)

In [24]:
# Metrics on the training split (the data the model was fitted on)
produce_metrics(y_train, y_pred_train)

Accuracy: 0.9483020814491764, F1 Score: 0.6811086922665801, Precision: 0.8397256106696354, Recall: 0.6296225894814238


In [25]:
# Metrics on the held-out validation split
produce_metrics(y_val, y_pred_val)

Accuracy: 0.9446262800267968, F1 Score: 0.6550135328565445, Precision: 0.7951687416803697, Recall: 0.6110502050917679


In [26]:
# Per-class precision/recall/F1 on the validation split
produce_classification_report(y_val, y_pred_val)

              precision    recall  f1-score   support

           0       0.95      0.99      0.97    245149
           1       0.64      0.23      0.34     16076

   micro avg       0.94      0.94      0.94    261225
   macro avg       0.80      0.61      0.66    261225
weighted avg       0.93      0.94      0.93    261225



## In this notebook we built an SVM model. We will explore an LSTM in the next notebook.