# Importing the essential libraries and reading the JSON file

In [1]:
import json
import pandas as pd
import numpy as np 
# Load the scraped catalogue into `js`. The `with` block closes the file on
# exit, so no explicit close() is needed. The encoding is pinned to UTF-8 so
# the result does not depend on the platform's default text encoding.
with open('men.json', 'r', encoding='utf-8') as fd:
    js = json.load(fd)

# Creating the .csv file from the json file

In [2]:
# Flatten the nested `products` mapping into one table row per product.
catalogue = js['products']

# Each product's dictionary key is appended to the site root to form its link.
links = ['https://www.myntra.com/' + slug for slug in catalogue.keys()]

# Field values / field names of every product, in each dictionary's own order.
values = [catalogue[slug].values() for slug in catalogue.keys()]
keys = [catalogue[slug].keys() for slug in catalogue.keys()]

# NOTE: the column names below are assigned by position, so every product
# dictionary is assumed to list its fields in this same order.
column_names = [
    'name', 'brand', 'size', 'image', 'actual_price',
    'discount_price', 'discount_percentage', 'sub_category',
]
rand_df = pd.DataFrame(values, columns=column_names)
rand_df['product_links'] = links
rand_df.to_csv('Myntra_men.csv', index=False)

In [3]:
# Dataset walkthrough
# Reload the CSV written by the previous cell and preview the first five rows.
# After the round trip through CSV the list-valued `sub_category` field comes
# back as text such as "['Topwear', 'Casual Shirts']".
df = pd.read_csv('Myntra_men.csv')
df.head()

Unnamed: 0,name,brand,size,image,actual_price,discount_price,discount_percentage,sub_category,product_links
0,Printed Casual Shirt,Roadster,40,"https://assets.myntassets.com/dpr_2,q_60,w_210...",Rs. 1499,Rs. 598,(Rs. 901 OFF),"['Topwear', 'Casual Shirts']",https://www.myntra.com/shirts/roadster/roadste...
1,Slim Fit Casual Shirt,HIGHLANDER,42,"https://assets.myntassets.com/dpr_2,q_60,w_210...",Rs. 1049,Rs. 524,(50% OFF),"['Topwear', 'Casual Shirts']",https://www.myntra.com/shirts/highlander/highl...
2,Slim Fit Casual Shirt,HERE&NOW,38,"https://assets.myntassets.com/dpr_2,q_60,w_210...",Rs. 1899,Rs. 699,(Rs. 1200 OFF),['Topwear'],https://www.myntra.com/shirts/herenow/herenow-...
3,Men Slim Fit Casual Shirt,The Indian Garage Co,40,"https://assets.myntassets.com/dpr_2,q_60,w_210...",Rs. 1649,Rs. 544,(67% OFF),"['Topwear', 'Casual Shirts']",https://www.myntra.com/shirts/the-indian-garag...
4,Relaxed Fit Hoodie,H&M,XS,"https://assets.myntassets.com/dpr_2,q_60,w_210...",Rs. 1499,Rs. 899,(40% OFF),['Topwear'],https://www.myntra.com/sweatshirts/hm/hm-men-r...


In [4]:
# Basic information about the dataset: number of rows, non-null count per
# column, and each column's dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229313 entries, 0 to 229312
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   name                 229313 non-null  object
 1   brand                229313 non-null  object
 2   size                 229200 non-null  object
 3   image                45136 non-null   object
 4   actual_price         196073 non-null  object
 5   discount_price       196073 non-null  object
 6   discount_percentage  196073 non-null  object
 7   sub_category         229313 non-null  object
 8   product_links        229313 non-null  object
dtypes: object(9)
memory usage: 15.7+ MB


In [5]:
# Summary statistics of the data.
# Every column is still of object (text) dtype at this point, so describe()
# reports count / unique / top / freq instead of numeric statistics. The price
# and discount columns have to be cleaned before arithmetic can be done on them.
df.describe()

Unnamed: 0,name,brand,size,image,actual_price,discount_price,discount_percentage,sub_category,product_links
count,229313,229313,229200,45136,196073,196073,196073,229313,229313
unique,21433,919,58,45132,1247,2538,695,14,229313
top,Slim Fit Casual Shirt,Roadster,S,"https://assets.myntassets.com/dpr_2,q_60,w_210...",Rs. 1999,Rs. 899,(50% OFF),['Topwear'],https://www.myntra.com/shirts/roadster/roadste...
freq,10725,8126,62289,2,12391,5124,23460,126690,1


In [6]:
# Count the missing (null) values in each column.
df.isnull().sum()

name                        0
brand                       0
size                      113
image                  184177
actual_price            33240
discount_price          33240
discount_percentage     33240
sub_category                0
product_links               0
dtype: int64

# pre-processing the actual price 

In [7]:
# Strip the leading "Rs. " currency tag from the listed price.
# Only values that actually start with the tag are trimmed, so missing prices
# (NaN) pass through unchanged and re-running this cell cannot eat further
# characters from an already cleaned value.
# The result is intentionally left as text; no numeric conversion happens here.
tag = 'Rs. '
price = [
    p[len(tag):] if isinstance(p, str) and p.startswith(tag) else p
    for p in df['actual_price']
]

df['actual_price'] = price

# pre-processing the discount price

In [8]:
# Strip the leading "Rs. " currency tag from the discounted price.
# Only values that actually start with the tag are trimmed, so missing prices
# (NaN) pass through unchanged and re-running this cell cannot eat further
# characters from an already cleaned value.
# The result is intentionally left as text; no numeric conversion happens here.
tag = 'Rs. '
d_price = [
    p[len(tag):] if isinstance(p, str) and p.startswith(tag) else p
    for p in df['discount_price']
]

df['discount_price'] = d_price

# pre-processing the discount percentage

In [9]:
# Reduce the raw discount label to its bare amount, e.g. "(50% OFF)" -> "50%"
# and "(Rs. 901 OFF)" -> "Rs. 901".
# The opening bracket and the whole " OFF)" suffix (including the space before
# "OFF") are removed, so no trailing blank is left behind. Values that do not
# have this shape -- missing values or already cleaned text -- are kept as they
# are, which makes the cell safe to re-run.
prefix = '('
suffix = ' OFF)'
dis_per = []
for i in df['discount_percentage']:
    if isinstance(i, str) and i.startswith(prefix) and i.endswith(suffix):
        dis_per.append(i[len(prefix):-len(suffix)])
    else:
        dis_per.append(i)

df['discount_percentage'] = dis_per

# pre-processing the sub-category

In [10]:
# Convert the stringified list, e.g. "['Topwear', 'Casual Shirts']", into plain
# comma-separated text ("Topwear,Casual Shirts").
# The slice drops the leading "['" and trailing "']"; for multi-entry values the
# "', '" separators between entries are collapsed to a single comma.
sub = [
    raw[2:-2] if len(raw.split(',')) == 1 else raw[2:-2].replace("', '", ',')
    for raw in df['sub_category']
]

df['sub_category'] = sub

In [11]:
# Preview the first five rows after the price, discount and sub-category clean-up.
df.head(5)

Unnamed: 0,name,brand,size,image,actual_price,discount_price,discount_percentage,sub_category,product_links
0,Printed Casual Shirt,Roadster,40,"https://assets.myntassets.com/dpr_2,q_60,w_210...",1499,598,Rs. 901,"Topwear,Casual Shirts",https://www.myntra.com/shirts/roadster/roadste...
1,Slim Fit Casual Shirt,HIGHLANDER,42,"https://assets.myntassets.com/dpr_2,q_60,w_210...",1049,524,50%,"Topwear,Casual Shirts",https://www.myntra.com/shirts/highlander/highl...
2,Slim Fit Casual Shirt,HERE&NOW,38,"https://assets.myntassets.com/dpr_2,q_60,w_210...",1899,699,Rs. 1200,Topwear,https://www.myntra.com/shirts/herenow/herenow-...
3,Men Slim Fit Casual Shirt,The Indian Garage Co,40,"https://assets.myntassets.com/dpr_2,q_60,w_210...",1649,544,67%,"Topwear,Casual Shirts",https://www.myntra.com/shirts/the-indian-garag...
4,Relaxed Fit Hoodie,H&M,XS,"https://assets.myntassets.com/dpr_2,q_60,w_210...",1499,899,40%,Topwear,https://www.myntra.com/sweatshirts/hm/hm-men-r...


In [12]:
# further processing the discount_percentage
#
# Recompute the discount for every row as a whole-number percentage of the
# actual price: (actual - discounted) / actual * 100, rounded.
# Columns are addressed by name rather than by position, and the prices are
# converted with pd.to_numeric, so the cell works whether the price columns
# still hold text or have already been made numeric.

actual = pd.to_numeric(df['actual_price'])
discounted = pd.to_numeric(df['discount_price'])

# A zero actual price would make the ratio undefined; treat it as missing.
actual = actual.where(actual != 0)

# Rows with a missing price on either side come out as NaN.
dis_percent = ((actual - discounted) / actual * 100).round().tolist()

df['discount_percentage'] = dis_percent

In [13]:
# increasing the image size by changing the `dpr_2` setting in each image URL to `dpr_2.5`

In [14]:
# Build the list of image URLs with the 'dpr_2' setting bumped to 'dpr_2.5'.
# Exactly one entry is produced per row, so `data` can be assigned back to the
# frame as a column:
#   * text URLs containing 'dpr_2' (and not already 'dpr_2.5') are rewritten;
#   * everything else -- missing values, URLs without the setting, URLs that
#     were already rewritten by an earlier run -- is kept unchanged.
data = []
for url in df['image']:
    if isinstance(url, str) and 'dpr_2' in url and 'dpr_2.5' not in url:
        data.append(url.replace('dpr_2', 'dpr_2.5'))
    else:
        data.append(url)

In [15]:
# Write the adjusted image URLs back to the frame; `data` is expected to hold
# exactly one entry per row of df.
df['image'] = data


In [16]:
# Preview the first five rows with the recomputed discount percentage and the
# updated image URLs.
df.head()

Unnamed: 0,name,brand,size,image,actual_price,discount_price,discount_percentage,sub_category,product_links
0,Printed Casual Shirt,Roadster,40,"https://assets.myntassets.com/dpr_2.5,q_60,w_2...",1499,598,60.0,"Topwear,Casual Shirts",https://www.myntra.com/shirts/roadster/roadste...
1,Slim Fit Casual Shirt,HIGHLANDER,42,"https://assets.myntassets.com/dpr_2.5,q_60,w_2...",1049,524,50.0,"Topwear,Casual Shirts",https://www.myntra.com/shirts/highlander/highl...
2,Slim Fit Casual Shirt,HERE&NOW,38,"https://assets.myntassets.com/dpr_2.5,q_60,w_2...",1899,699,63.0,Topwear,https://www.myntra.com/shirts/herenow/herenow-...
3,Men Slim Fit Casual Shirt,The Indian Garage Co,40,"https://assets.myntassets.com/dpr_2.5,q_60,w_2...",1649,544,67.0,"Topwear,Casual Shirts",https://www.myntra.com/shirts/the-indian-garag...
4,Relaxed Fit Hoodie,H&M,XS,"https://assets.myntassets.com/dpr_2.5,q_60,w_2...",1499,899,40.0,Topwear,https://www.myntra.com/sweatshirts/hm/hm-men-r...


In [17]:
# finally saving the file 

In [18]:
# Save the cleaned dataset; index=False keeps the row index out of the file.
df.to_csv('Final_Myntra.csv',index = False)