In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA

In [None]:
# Location of the review dataset.
# NOTE(review): absolute path under /content/drive — presumably requires Google Drive
# to be mounted in Colab before this cell runs; confirm.
path = '/content/drive/MyDrive/Data Science/Project-39 Clothing Fit-Size predictions/cloth_yelp.json'
# lines=True: the file is read as JSON Lines (one JSON record per line).
df= pd.read_json(path, lines=True)

In [None]:
# (rows, columns) of the dataset as loaded.
df.shape

In [None]:
# Column dtypes and non-null counts, to see which columns need cleaning.
df.info()

In [None]:
# Preview the first five rows of the loaded data.
df.head()

In [None]:
# Remove the id, user-name and review-text columns; `df` is mutated in place.
df.drop(
    columns=['item_id', 'user_id', 'review_summary', 'review_text', 'user_name'],
    inplace=True,
)

In [None]:
# Preview the columns that remain after the drop.
df.head()

In [None]:
# Number of missing values in each column.
df.isna().sum()

### filling waist null values

In [None]:
# Mean waist per `size` value; fillWaist() uses this as the lookup table
# when imputing missing waists.
waist_cor = df['waist'].groupby(df['size']).mean()
waist_cor

In [None]:
def fillWaist():
  """Build the waist column with missing entries imputed.

  Reads the notebook-level ``df`` and ``waist_cor``. For each row label
  0..len-1, a NaN waist is replaced by the mean waist of that row's
  ``size`` (looked up in ``waist_cor``) rounded to one decimal; any other
  waist is kept unchanged.

  Returns
  -------
  list
      One value per row, in row order.
  """
  waist_values = df['waist']
  size_values = df['size']
  # Lookups are label-based, so this relies on df having labels 0..len-1.
  return [
      round(waist_cor[size_values[row]], 1)
      if math.isnan(waist_values[row])
      else waist_values[row]
      for row in range(len(waist_values))
  ]

In [None]:
# Overwrite the waist column with the list returned by fillWaist().
df['waist'] = fillWaist()

In [None]:
# Preview the frame after the waist column was reassigned.
df.head()

### Height

In [None]:
# Convert height text (presumably formatted like '5ft 6in' — confirm against the data)
# into total inches as float64.
# Simply swapping 'ft' for '.' would turn '5ft 10in' into '5.10' == 5.1, i.e. shorter
# than '5ft 9in' (5.9), because inches are not a decimal fraction of a foot.
height_parts = (
    df['height']
    .str.extract(r'(?P<feet>\d+)\s*ft(?:\s*(?P<inches>\d+)\s*in)?')
    .astype('float64')
)
# A missing or unparseable height leaves `feet` as NaN, so the result stays NaN;
# a height with no inches part (e.g. '5ft') counts as zero extra inches.
df['height'] = height_parts['feet'] * 12 + height_parts['inches'].fillna(0)

In [None]:
# Cast height to float64; missing values stay NaN.
# The previous per-element guard `x != 'NaN' or x != 'nan'` was always true (no value
# equals both strings), so its else branch was dead code and every element was already
# being passed to np.float64. A plain cast does the same work without the misleading
# condition, and still raises ValueError on text that is not a number.
df['height'] = df['height'].astype(np.float64)

### Bust

In [None]:
# Replace the range-valued entry '37-39' with its midpoint '38', then cast the
# whole column to float64.
df['bust'] = df['bust'].replace({'37-39': '38'}).astype(np.float64)

In [None]:
# Median bust per `size` value; fill_bust() uses this as its lookup table.
bust = df['bust'].groupby(df['size']).median()
bust

In [None]:
def fill_bust(frame=None, size_medians=None):
  """Build the bust column with missing entries imputed.

  A missing bust is replaced by the median bust of the row's ``size``,
  rounded to one decimal. A bust that is present is returned unchanged.
  (The earlier version returned the row's *waist* for every non-missing
  bust, which overwrote all real bust measurements.)

  Parameters
  ----------
  frame : pandas.DataFrame, optional
      Frame with ``'bust'`` and ``'size'`` columns. Defaults to the
      notebook-level ``df``.
  size_medians : pandas.Series, optional
      Median bust indexed by size. Defaults to the notebook-level ``bust``.

  Returns
  -------
  list
      One value per row of ``frame``, in row order. A row whose size has
      no entry in ``size_medians`` stays NaN.
  """
  if frame is None:
    frame = df
  if size_medians is None:
    size_medians = bust
  # Vectorised lookup: works for any index, not only labels 0..len-1.
  fallback = frame['size'].map(size_medians).round(1)
  return frame['bust'].fillna(fallback).tolist()

In [None]:
# Overwrite the bust column with the list returned by fill_bust().
df['bust'] = fill_bust()

### dropping Null values

In [None]:
# Drop every row that still has a missing value in any column; mutates df in place.
df.dropna(inplace=True)

In [None]:
# Preview the frame after rows with missing values were dropped.
df.head()

In [None]:
# (rows, columns) left after dropping rows with missing values.
df.shape

In [None]:
# Names of the columns still stored with object dtype; these get encoded below.
obj_col = df.select_dtypes(include='object').columns

In [None]:
# For every object-typed column print its name, its value frequencies and a separator.
for col_name in obj_col:
  print(col_name, df[col_name].value_counts(), '---'*20, sep='\n')

In [None]:
# Integer-encode each object-typed column in place and print the class -> code mapping.
# `encoder` is rebound on every pass, so only the last column's encoder survives the loop.
for col_name in obj_col:
  encoder = LabelEncoder()
  df[col_name] = encoder.fit_transform(df[col_name])
  print(col_name)
  print(dict(zip(encoder.classes_, encoder.transform(encoder.classes_))))
  print('----'*20)

In [None]:
# Preview the frame after label encoding.
df.head()

## Checking outliers and correlation

In [None]:
# Annotated heatmap of the pairwise column correlations.
sns.set_style('whitegrid')
plt.figure(figsize=(13, 13))
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True, cmap='viridis', linewidths=0.8)
plt.show()

In [None]:
# Box plot of `waist` to look for outliers.
sns.boxplot(data=df, x='waist')

In [None]:
# Box plot of `size` to look for outliers.
sns.boxplot(data=df, x='size')

In [None]:
# Box plot of `hips` to look for outliers.
sns.boxplot(data=df, x='hips')

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# (rows, columns) of the cleaned frame going into modelling.
df.shape

In [None]:
# Features: every column except waist, hips and the target.
x = df.drop(columns=['waist', 'quality', 'hips'])
# Target: the `quality` column as a NumPy array.
y = df['quality'].to_numpy()

In [None]:
# Split first, then fit the scaler on the training rows only.
# Fitting StandardScaler on the full matrix before splitting lets the test rows
# influence the mean/std used for scaling (data leakage), which biases the
# held-out score.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=111)

scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
# Keep `x` as the scaled full matrix, as before, for any later cell that reads it;
# it is scaled with the training-set statistics.
x = scaler.transform(x)

In [None]:
# import models
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
# Model classes (not instances) to compare; getModel_score() instantiates each
# one with its default arguments.
all_model = [LogisticRegression, RandomForestClassifier, DecisionTreeClassifier]

In [None]:
def getModel_score(x):
  """Fit a freshly built model and print its name and its two scores.

  Parameters
  ----------
  x : callable
      Model class (or zero-argument factory); called as ``x()`` to build
      the estimator. Inside this function the name shadows the
      notebook-level feature matrix that is also called ``x``.

  Reads ``x_train``, ``y_train``, ``x_test`` and ``y_test`` from the
  notebook namespace. Prints the estimator's class name, then its
  ``score`` on the training split, then on the test split. Returns None.
  """
  estimator = x()
  estimator.fit(x_train, y_train)
  print(type(estimator).__name__)
  # Training split first, then the held-out split.
  for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(estimator.score(features, labels))


In [None]:
# Fit every candidate model class and print its train/test scores.
for model_cls in all_model:
  getModel_score(model_cls)