# SGDClassifier with Pipeline

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the training pairs, discard every row that has a missing value,
# and renumber the index so it is contiguous again after the drop.
df = (
    pd.read_csv('input/train.csv')
    .dropna(how="any")
    .reset_index(drop=True)
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404288 entries, 0 to 404287
Data columns (total 6 columns):
id              404288 non-null int64
qid1            404288 non-null int64
qid2            404288 non-null int64
question1       404288 non-null object
question2       404288 non-null object
is_duplicate    404288 non-null int64
dtypes: int64(4), object(2)
memory usage: 18.5+ MB


In [3]:
# Read the test set in 600000-row chunks and stitch the chunks together
# into a single frame with a fresh 0..n-1 index.
df_test = pd.concat(
    pd.read_csv("input/test.csv", low_memory=False, iterator=True, chunksize=600000),
    ignore_index=True,
)
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2345796 entries, 0 to 2345795
Data columns (total 3 columns):
test_id      int64
question1    object
question2    object
dtypes: int64(1), object(2)
memory usage: 53.7+ MB


## append vs concat

In [4]:
# Time stacking both test question columns into one Series with pd.concat.
%time all_questions = pd.concat([df_test['question1'], df_test['question2']])

CPU times: user 90.9 ms, sys: 43.1 ms, total: 134 ms
Wall time: 134 ms


In [5]:
# Same stacking done with Series.append, timed for comparison with the concat
# cell; this overwrites `all_questions`.
# NOTE(review): Series.append has been deprecated and later removed in newer
# pandas releases — confirm it exists in the installed version.
%time all_questions = df_test['question1'].append(df_test['question2'])

CPU times: user 128 ms, sys: 45.8 ms, total: 174 ms
Wall time: 172 ms


## Feature Engineering with parameter tuning

- TfidfVectorizer, CountVectorizer
- Dimensionality Reduction with LSA
- Parameter Tuning with GridSearch

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import pairwise_distances
from sklearn.pipeline import Pipeline

In [7]:
# Corpus for fitting: both training question columns stacked end to end.
data = pd.concat([df[column] for column in ('question1', 'question2')])
# Label column for the training pairs.
target = df['is_duplicate']

In [8]:
# TF-IDF vectorizer with English stop words removed and binary=True.
tfidf = TfidfVectorizer(
    binary=True,
    stop_words='english',
)
# Truncated SVD (the LSA step): 500 components, randomized solver,
# 10 iterations, fixed seed.
svd_model = TruncatedSVD(
    n_components=500,
    algorithm='randomized',
    n_iter=10,
    random_state=42,
)

In [9]:
# Chain the two steps: TF-IDF vectorizer first, then the SVD reduction.
svd_transformer = Pipeline(steps=[
    ('tfidf', tfidf),
    ('svd', svd_model),
])

In [None]:
# Fit the TF-IDF + SVD pipeline on the combined question corpus and time it.
# NOTE(review): the result is assigned back to `svd_model`, the name that
# previously held the TruncatedSVD step. The next cell calls `.transform` on
# raw text columns through this name, so it presumably now refers to the whole
# fitted pipeline — confirm.
%time svd_model = svd_transformer.fit(data)

In [None]:
# Project each training question column through the fitted TF-IDF + SVD pipeline.
# The pipeline is called by its own name: `Pipeline.fit` returns the pipeline
# itself, so this is the same fitted object, and the cell no longer depends on
# `svd_model` (originally the bare TruncatedSVD step) having been rebound to it.
question1 = svd_transformer.transform(df['question1'])
question2 = svd_transformer.transform(df['question2'])

In [None]:
# Cosine distance between the two questions of each training pair
# (row i of question1 against row i of question2).
#
# An all-vs-all pairwise_distances(question1, question2) call would allocate an
# n_pairs x n_pairs float matrix; with the ~404k training rows loaded above that
# is on the order of a terabyte. Only the row-aligned entries (its diagonal)
# describe a labelled pair, so those are computed directly.
dot_products = np.einsum('ij,ij->i', question1, question2)
norm_products = np.linalg.norm(question1, axis=1) * np.linalg.norm(question2, axis=1)

# Rows where either vector is all zeros get similarity 0 (distance 1)
# instead of a division-by-zero NaN.
cosine_similarity = np.divide(
    dot_products,
    norm_products,
    out=np.zeros_like(dot_products, dtype=float),
    where=norm_products > 0,
)

# Despite the name (kept so later cells can still find it), this is a 1-D array
# with one distance per pair, clipped to the valid cosine-distance range [0, 2].
distance_matrix = np.clip(1.0 - cosine_similarity, 0.0, 2.0)