### Load libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import f1_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

### Load and check data

In [2]:
# Read the labelled company-name pairs; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='../data/data_v2_19_oct.csv')

In [3]:
# Peek at the first five rows to sanity-check the columns that were read.
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,pair_id,name_1,name_2,is_duplicate,name_1_upd,name_2_upd
0,0,1,Iko Industries Ltd.,"Enormous Industrial Trade Pvt., Ltd.",0,Iko,Enormous Trade
1,1,2,Apcotex Industries Ltd.,Technocraft Industries (India) Ltd.,0,Apcotex,Technocraft India
2,2,3,"Rishichem Distributors Pvt., Ltd.",Dsa,0,Rishichem Distributors,Dsa
3,3,4,Powermax Rubber Factory,Co. One,0,Powermax Factory,One
4,4,5,Tress A/S,Longyou Industries Park Zhejiang,0,Tress,Longyou Park Zhejiang


In [4]:
def _letters_and_spaces_only(comp_name):
    """Return ``comp_name`` with every character that is not a letter or a space removed.

    Letters are decided by ``str.isalpha`` (so non-ASCII letters are kept);
    digits, punctuation and any whitespace other than ' ' are dropped.
    A non-string value (for example the NaN pandas produces for an empty CSV
    field) is returned as an empty string instead of raising ``TypeError``.
    """
    if not isinstance(comp_name, str):
        return ''
    return ''.join(char for char in comp_name if char.isalpha() or char == ' ')


# Same clean-up for both pre-processed name columns. Bracket assignment is used
# instead of attribute assignment so the target is unambiguously a column.
for name_column in ('name_1_upd', 'name_2_upd'):
    data[name_column] = data[name_column].apply(_letters_and_spaces_only)

In [5]:
# Vocabulary settings: skip tokens found in more than 85% of the fitted names
# or in fewer than 5 of them, then keep at most 400 terms.
tfidf = TfidfVectorizer(
    max_features=400,
    min_df=5,
    max_df=0.85,
)

In [6]:
# Hold out 40% of the pairs for testing; stratifying on the label keeps the
# duplicate ratio the same in both parts, and the fixed seed makes the split repeatable.
train, test = train_test_split(
    data,
    test_size=0.4,
    stratify=data['is_duplicate'],
    random_state=42,
)

In [7]:
# Stack both name columns of the training split into one Series of documents,
# so the vectorizer vocabulary is built from training names only.
data_to_tfidf = pd.concat([train[column] for column in ('name_1_upd', 'name_2_upd')])

In [8]:
# Learn the vocabulary and IDF weights from the stacked training names.
tfidf.fit(raw_documents=data_to_tfidf)

TfidfVectorizer(max_df=0.85, max_features=400, min_df=5)

In [9]:
# Vectorize each name column of the training split with the fitted TF-IDF model
# and expand the sparse result into a dense DataFrame (default 0..n-1 labels).
# NOTE(review): .toarray() materialises every zero; on a large split this can be
# memory-heavy — confirm it fits before scaling up.
train_name_1, train_name_2 = (
    pd.DataFrame(tfidf.transform(train[column]).toarray())
    for column in ('name_1_upd', 'name_2_upd')
)

In [10]:
# Put the two TF-IDF blocks side by side: one row per pair, name_1 features
# first, then name_2 features. Both inputs carry the default 0..n-1 column
# labels, so ignore_index=True renumbers the result's columns instead of
# keeping every label twice (which makes column selection by label ambiguous).
train_name_data = pd.concat([train_name_1, train_name_2], axis=1, ignore_index=True)

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [15]:
# Logistic-regression classifier; every hyper-parameter not listed here keeps
# the library default. verbose=1 asks the solver to report progress.
# NOTE(review): n_jobs is documented as parallelising over classes; for this
# binary target it may have no effect — confirm against the installed version.
model = LogisticRegression(
    n_jobs=4,
    verbose=1,
)

In [16]:
%%time
model.fit(train_name_data, train.is_duplicate)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 out of   1 | elapsed: 16.4min finished


CPU times: total: 5.95 s
Wall time: 16min 31s


LogisticRegression(n_jobs=4, verbose=1)

In [17]:
# F1 of the duplicate class on the same rows the model was fitted on.
f1_score(y_true=train['is_duplicate'], y_pred=model.predict(train_name_data))

0.6400714498362609

In [18]:
# Per-class precision / recall / F1 on the training split; print() is needed so
# the multi-line report string renders as a table.
print(
    classification_report(
        y_true=train['is_duplicate'],
        y_pred=model.predict(train_name_data),
    )
)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    296496
           1       0.92      0.49      0.64      2195

    accuracy                           1.00    298691
   macro avg       0.96      0.74      0.82    298691
weighted avg       1.00      1.00      1.00    298691



In [19]:
# Vectorize each name column of the held-out split with the TF-IDF model that
# was fitted on the training names (transform only, no refit), then expand the
# sparse result into a dense DataFrame.
# NOTE(review): .toarray() materialises every zero; confirm memory headroom on large splits.
test_name_1, test_name_2 = (
    pd.DataFrame(tfidf.transform(test[column]).toarray())
    for column in ('name_1_upd', 'name_2_upd')
)

In [20]:
# Put the two TF-IDF blocks of the held-out split side by side: one row per
# pair, name_1 features first, then name_2 features. Both inputs carry the
# default 0..n-1 column labels, so ignore_index=True renumbers the result's
# columns instead of keeping every label twice.
test_name_data = pd.concat([test_name_1, test_name_2], axis=1, ignore_index=True)

In [21]:
# F1 of the duplicate class on the held-out split.
f1_score(y_true=test['is_duplicate'], y_pred=model.predict(test_name_data))

0.6294461954074742

In [22]:
# Per-class precision / recall / F1 on the held-out split; print() is needed so
# the multi-line report string renders as a table.
print(
    classification_report(
        y_true=test['is_duplicate'],
        y_pred=model.predict(test_name_data),
    )
)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    197665
           1       0.92      0.48      0.63      1463

    accuracy                           1.00    199128
   macro avg       0.96      0.74      0.81    199128
weighted avg       1.00      1.00      1.00    199128

