# Importing Required Libraries

In [1]:
import pandas as pd
import numpy as np
import time

import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')
from sklearn import preprocessing


# Converting JSON data to a DataFrame

In [None]:
# Read the line-delimited JSON training pairs, write them out as CSV, and
# continue the analysis from the CSV copy (so `df` carries the dtypes that
# read_csv infers, not the ones read_json produced).
data = pd.read_json('computers_train_xlarge.json',lines = True)
data.to_csv('train.csv',index = False)
df = pd.read_csv('train.csv')
df


In [None]:
# Show the column names; the positional drop in the next section depends on
# this order.
df.columns

# Data Cleaning : Removing Unwanted Columns

In [None]:
# Drop ten columns, selected by position, from `df` in place.
# NOTE(review): positional indices silently remove the wrong columns if the
# JSON schema or column order ever changes -- consider dropping by name.
df.drop(df.columns[[2,5,8,9,12,13,14,15,16,17]],axis=1,inplace =True)
df

# Data Analysis 

In [None]:
# Column dtypes and non-null counts of the reduced frame.
df.info()

In [None]:
# Frequency of each category on the left side of the pairs.
df['category_left'].value_counts()

In [None]:
# Frequency of each category on the right side of the pairs.
df['category_right'].value_counts()


In [None]:
# How often each left-hand id occurs.
df['id_left'].value_counts()


In [None]:
# Number of rows whose left-hand id is negative.
(df['id_left'] < 0).sum()

In [None]:
# Distribution of the `label` column before filtering.
df['label'].value_counts()

In [None]:
# Remove every row whose label is 0, in place; only the remaining rows are
# used from here on.
df.drop(df.index[df['label'] == 0],inplace = True)

In [None]:
# Confirm that no label-0 rows are left.
df['label'].value_counts()

In [None]:
df['description_left'].isnull().sum()

# Filling the Null values of Description Columns by Title

In [None]:
df.description_left.fillna(df.title_left, inplace = True)
df['description_left']

In [None]:
df['description_left'].isnull().sum()

In [None]:
df['description_right'].isnull().sum()

In [None]:
df.description_right.fillna(df.title_left, inplace = True)
df['description_right']

In [None]:
df['description_right'].isnull().sum()

# Comparing category left and category right

In [None]:
# Flag pairs whose left and right categories are identical.
df['category_match'] = np.where(df['category_left'] == df['category_right'],1,0)
df.head()

# category_match: 1 = categories match, 0 = categories differ


In [None]:
# How many pairs share a category and how many do not.
df['category_match'].value_counts()


In [None]:
df.info()

In [None]:
# Discard, in place, every pair whose categories differ.
df.drop(df.index[df['category_match'] == 0],inplace = True)

In [None]:
# Only category_match == 1 should remain.
df['category_match'].value_counts()

In [None]:
# Snapshot of the cleaned frame taken before feature engineering.
newdf=df.copy()
newdf.head()

# Applying the Matching Numbers function to match the product features

In [None]:
def matching_numbers(description_right, description_left):
    """Return the Jaccard similarity of the digit runs found in two strings.

    Every maximal run of ASCII digits is extracted from each description,
    duplicates are collapsed, and the two resulting sets are compared as
    ``len(intersection) / len(union)``.

    Args:
        description_right: Text of the right-hand product.
        description_left: Text of the left-hand product.

    Returns:
        The integer ``1`` when neither text contains a digit; otherwise a
        float between 0.0 and 1.0.
    """
    digit_run = r'[0-9]+'
    right_numbers = set(re.findall(digit_run, description_right))
    left_numbers = set(re.findall(digit_run, description_left))

    all_numbers = right_numbers | left_numbers
    if not all_numbers:
        # No digits on either side: treat the pair as a perfect match rather
        # than dividing by zero.
        return 1

    shared_numbers = right_numbers & left_numbers
    return len(shared_numbers) / len(all_numbers)


# Implementing Levenshtein Text similarity 

In [None]:

import jellyfish as jf
def engineer_features(df):
    """Add text-similarity features for each left/right description pair.

    The frame is modified in place and is also returned. Columns written:

    * ``description_left`` / ``description_right`` -- lower-cased strings
      (missing values become the empty string).
    * ``levenshtein_distance`` -- edit distance between the two descriptions.
    * ``matching_numbers`` -- value of ``matching_numbers`` for the pair.
    * ``matching_numbers_log`` -- ``log(1 + matching_numbers)``.

    Finally, infinities anywhere in the frame are turned into NaN and every
    NaN in the frame is replaced by 0.

    Args:
        df: DataFrame with ``description_left`` and ``description_right``.

    Returns:
        The same DataFrame object, with the columns above added/updated.
    """
    # Normalise to lower-case strings first. A NaN (or any non-string value)
    # left in these columns would make the string comparisons below raise
    # TypeError, so missing descriptions are treated as empty text.
    for column in ('description_left', 'description_right'):
        df[column] = df[column].fillna('').astype(str).str.lower()

    # Build the features from plain Python pairs instead of a row-wise
    # df.apply(axis=1): same values, but it also works on an empty frame and
    # avoids constructing a Series per row.
    df['levenshtein_distance'] = [
        jf.levenshtein_distance(right, left)
        for right, left in zip(df['description_right'], df['description_left'])
    ]

    df['matching_numbers'] = [
        matching_numbers(right, left)
        for right, left in zip(df['description_right'], df['description_left'])
    ]

    df['matching_numbers_log'] = np.log1p(df['matching_numbers'])

    # Frame-wide clean-up kept from the original behaviour: no infinities and
    # no NaN remain in any column after this point.
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.fillna(value=0, inplace=True)

    return df


# All Required Features 

In [None]:
# Frame as it looks right before feature engineering.
df.head()

In [None]:
# Compute the similarity features, then keep only the two descriptions and
# the two features used for modelling.
df = engineer_features(df)
df = df[['description_left','description_right','levenshtein_distance','matching_numbers']]
df

In [None]:
# Order the pairs from the largest to the smallest edit distance.
df=df.sort_values(by=['levenshtein_distance'], ascending=[False])
df

# Examining the mean, count and max values of columns

In [None]:
# Summary statistics of the numeric feature columns.
df.describe()

In [None]:
# Rule-based training target: a pair counts as a match (1) when its edit
# distance is below 60 and its matching_numbers score is above 0.7.
df['match'] = np.where(((df['levenshtein_distance']<60) & (df['matching_numbers']>0.7)),1,0)
df['match'].value_counts()

In [None]:
df

# Applying Validation set For Finding Model Accuracy

In [None]:
# Read the line-delimited JSON evaluation pairs, write them out as CSV, and
# reload them from the CSV copy, mirroring the training-data cell.
data1 = pd.read_json('computers_gs.json',lines = True)
data1.to_csv('test.csv',index = False)
df_test = pd.read_csv('test.csv')
df_test.info()

In [None]:
# Column names; the positional drop below depends on this order.
df_test.columns


In [None]:
# Drop the same ten column positions that were dropped from the training frame.
# NOTE(review): this assumes both files share one column order -- verify,
# or drop by column name instead.
df_test.drop(df_test.columns[[2,5,8,9,12,13,14,15,16,17]],axis=1,inplace =True)
df_test


In [None]:
df_test['description_right'].isnull().sum()

In [None]:
df_test.description_right.fillna(df_test.title_left, inplace = True)
df_test['description_right']

In [None]:
df_test['description_right'].isnull().sum()

In [None]:
df_test['description_left'].isnull().sum()

In [None]:
df_test.description_left.fillna(df_test.title_left, inplace = True)
df_test['description_left']

In [None]:
df_test['description_left'].isnull().sum()

In [None]:
# Flag evaluation pairs whose left and right categories are identical
# (1 = same category, 0 = different).
df_test['category_match'] = np.where(df_test['category_left'] == df_test['category_right'],1,0)
df_test

In [None]:
# How many evaluation pairs share a category and how many do not.
df_test['category_match'].value_counts()

In [None]:
# Discard, in place, every evaluation pair whose categories differ.
df_test.drop(df_test.index[df_test['category_match'] == 0],inplace = True)

In [None]:
# Only category_match == 1 should remain.
df_test['category_match'].value_counts()

In [None]:
# Snapshot of the cleaned evaluation frame taken before feature engineering.
newdf_test=df_test.copy()
newdf_test.head()

In [None]:
df_test = engineer_features(df_test)
df_test = df_test[['description_left','description_right','levenshtein_distance','matching_numbers','label']]
df_test

In [None]:
df_test['match'] = np.where(((df_test['levenshtein_distance']<40) & (df_test['matching_numbers']>0.4)),1,0)
df_test['match'].value_counts()

In [None]:
X_train=df[['levenshtein_distance','matching_numbers']]
X_test=df_test[['levenshtein_distance','matching_numbers']]
y_train=df['match']
y_test=df_test['match']
y_train

# Creating Function For Finding Confusion Matrix

In [None]:
def get_confusion_matrix_values(y_test, y_pred):
    """Return the four cells of the top-left 2x2 corner of the confusion matrix.

    The cells come back row by row: ``(m[0][0], m[0][1], m[1][0], m[1][1])``.
    scikit-learn puts true classes on rows and predicted classes on columns,
    with labels in sorted order, so for 0/1 labels the tuple reads
    ``(true_neg, false_pos, false_neg, true_pos)``.

    Args:
        y_test: Reference labels.
        y_pred: Predicted labels.

    Returns:
        A 4-tuple of counts in the order described above.
    """
    matrix = confusion_matrix(y_test, y_pred)
    return tuple(matrix[row][col] for row in (0, 1) for col in (0, 1))

# Model Building: Decision Tree, Random Forest and Support Vector Classifier

In [None]:
from sklearn.svm import SVC

# Candidate models, keyed by the name shown in the results table.
# NOTE(review): the two features are on very different scales and the RBF SVC
# is scale-sensitive; consider standardising the inputs (not changed here so
# that the reported numbers stay comparable).
classifiers = {
    "DecisionTreeClassifier":DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=1, min_samples_split=4,random_state=42),
    "Support Vector Classifier":SVC(kernel='rbf', gamma=0.1),
    "RandomForestClassifier":RandomForestClassifier(n_estimators=1000,max_depth=4,random_state=42,n_jobs=-1),

}

result_columns = ['model', 'accuracy', 'precision',
                  'true_pos', 'false_pos',
                  'true_neg', 'false_neg', 'recall', 'f1']

# Rows are collected in a list and turned into a frame once at the end:
# DataFrame.append was removed in pandas 2.0.
result_rows = []

for key, classifier in classifiers.items():

    model = classifier.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    # zero_division=0 keeps recall consistent with precision and f1 when a
    # class is never predicted or never present.
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    classification = classification_report(y_test, y_pred, zero_division=0)
    # scikit-learn lays a binary confusion matrix out as [[tn, fp], [fn, tp]],
    # so the helper's row-by-row tuple is (tn, fp, fn, tp) -- not tp first.
    tn, fp, fn, tp = get_confusion_matrix_values(y_test, y_pred)

    # Keys must match result_columns exactly; a differently-cased key would
    # leave its column empty.
    row = {'model': key,
           'accuracy': accuracy,
           'precision': precision,
           'recall': recall,
           'f1': f1,
           'true_pos': tp,
           'false_pos': fp,
           'true_neg': tn,
           'false_neg': fn,
          }
    result_rows.append(row)

df_results = pd.DataFrame(result_rows, columns=result_columns)

df_results.head(10)

# Estimating Results in Binary

In [None]:
# Put the current predictions (whatever `y_pred` holds at this point) next to
# the reference targets and mark each row 1 when they agree, 0 otherwise.
results = pd.DataFrame({'predictions': y_pred, 'actual': y_test})
is_correct = results['predictions'] == results['actual']
results['result'] = is_correct.astype(int)
results


# Final Results with Match and Not Matched Classification

In [None]:
# Turn the 0/1 codes into readable text. The replaced columns are assigned
# back to the frame: calling replace(inplace=True) on a column selected from
# the frame is a chained in-place update, which pandas deprecates and which
# leaves `results` untouched when Copy-on-Write is enabled.
# Both label columns share one mapping so their text is spelled identically
# (previously 'Not match' in one column and 'Not Match' in the other).
match_names = {0: 'Not Match', 1: 'Match'}
results['predictions'] = results['predictions'].replace(match_names)
results['actual'] = results['actual'].replace(match_names)

# 'True' when prediction and reference agree, 'False' otherwise.
results['result'] = results['result'].replace({0: 'False', 1: 'True'})
results