In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn import preprocessing
import scipy
from scipy import stats
from scipy.stats import skewnorm
from scipy.stats import norm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

%matplotlib inline
sns.set(color_codes=True)

# Data Description

In [None]:
# Load the raw product listings; the file is read with latin1 encoding.
Products_data = pd.read_csv(
    '/kaggle/input/data-analysis-products-dataset/ProductsData.csv',
    encoding='latin1',
)
# Number the rows starting from 1 instead of the default 0.
Products_data.index = pd.RangeIndex(start=1, stop=len(Products_data) + 1)
Products_data.head()

In [None]:
# (rows, columns) of the freshly loaded dataset.
Products_data.shape

In [None]:
# Per-column dtype and non-null count summary of the loaded dataset.
Products_data.info()

In [None]:
# Summary statistics of the loaded dataset before any cleaning.
Products_data.describe()

# Data Cleaning

**Removing all the double quotes from each element in the features**

In [None]:
# Strip every double-quote character from the text columns.
# Only object/string columns are touched: the .str accessor raises
# AttributeError on numeric columns, so those are left as they are.
for col in Products_data.columns:
    column_values = Products_data[col]
    if pd.api.types.is_object_dtype(column_values) or pd.api.types.is_string_dtype(column_values):
        # regex=False: the quote is a literal character, not a pattern.
        Products_data[col] = column_values.str.replace('"', '', regex=False)
Products_data.head()

**Replacing all empty string values with NaN**

In [None]:
# Treat cells that are empty or whitespace-only as missing values.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
Products_data=Products_data.replace(r'^\s*$',np.nan,regex=True)
Products_data.head()

**Replacing all the unwanted strings with NaN**

In [None]:
def _nan_if_question_mark(name):
    """Return NaN when the text form of `name` contains '?', else `name` unchanged."""
    return np.nan if '?' in str(name) else name


# Product names containing a '?' are replaced with NaN.
Products_data['Product_name'] = Products_data['Product_name'].map(_nan_if_question_mark)
Products_data.head()

**Removing all the unwanted space in price column**

In [None]:
# Remove every space character from the price strings.
price_text = Products_data['price']
Products_data['price'] = price_text.str.replace(' ', '', regex=False)
Products_data.head()

In [None]:
# Count of missing values in each column after the cleaning steps above.
missing_per_column = Products_data.isna().sum()
print(missing_per_column)

**Rows with a missing price can be used as the test set, which will be evaluated by the model.**
**Rows with a missing product name need to be removed, as they are meaningless and will not be useful in evaluation.**

In [None]:
# Collect the rows whose product name is missing; `data` holds them so they can be dropped.
missing_name_mask = Products_data['Product_name'].isna()
data = Products_data.loc[missing_name_mask]
data.shape

In [None]:
# Remove the rows collected in `data` (matched by their index labels).
Products_data = Products_data.drop(index=data.index)
Products_data.shape

### Separating Test Data from the original Product Dataset

In [None]:
# The test set is made of the rows whose target (price) is missing.
# Selecting on the price column only: a row that is missing some other field
# but has a price stays in the training data and is not duplicated into the test set.
test_data=Products_data[Products_data['price'].isnull()]
test_data.shape

In [None]:
# Preview the first rows of the separated test set.
test_data.head()

In [None]:
# Drop the price column from the test set and number its rows from 1.
test_data = test_data.drop(columns='price')
test_data.index = pd.RangeIndex(start=1, stop=len(test_data) + 1)
test_data.head()

In [None]:
# Drop fully duplicated rows (first occurrence kept) and compare the shapes.
data = test_data.drop_duplicates()
for label, frame in (
    ('Shape of Original dataset:', test_data),
    ('Shape of dataset after dropping duplicates:', data),
):
    print(label, frame.shape)

#### Since the test dataset has fewer rows after dropping duplicates than before, it contains duplicate rows, so we will drop them for further evaluation.

In [None]:
# From here on the test set is the de-duplicated frame held in `data`.
test_data=data
test_data.shape

### Dropping the rows with missing values in target variable

In [None]:
# Keep only the rows that have a price (the target variable).
Products_data = Products_data.dropna(subset=['price'])

In [None]:
# Summary statistics after removing the rows with a missing price.
Products_data.describe()

In [None]:
# Remaining missing values per column after removing rows with a missing price.
remaining_missing = Products_data.isna().sum()
print(remaining_missing)

In [None]:
# (rows, columns) after removing the rows with a missing price.
Products_data.shape

In [None]:
# Renumber the remaining rows consecutively, starting from 1.
Products_data.index = pd.RangeIndex(start=1, stop=len(Products_data) + 1)

In [None]:
# Drop fully duplicated rows (first occurrence kept) and compare the shapes.
without_duplicate_data = Products_data.drop_duplicates()

for label, frame in (
    ('Shape of Original dataset:', Products_data),
    ('Shape of dataset after dropping duplicates:', without_duplicate_data),
):
    print(label, frame.shape)

#### Since the Product dataset has fewer rows after dropping duplicates than before, it contains duplicate rows, so we will drop them for further evaluation.

In [None]:
# From here on the working dataset is the de-duplicated frame.
Products_data=without_duplicate_data
Products_data.shape

In [None]:
# Summary statistics of the de-duplicated dataset.
Products_data.describe()

# Data Visualisation

In [None]:
# Convert price and Product_id to numbers; values that cannot be parsed become NaN.
for numeric_col in ('price', 'Product_id'):
    Products_data[numeric_col] = pd.to_numeric(Products_data[numeric_col], errors='coerce')

In [None]:
# Order the rows from the cheapest to the most expensive product.
Products_data = Products_data.sort_values(by='price', ascending=True)

In [None]:
# Swarm plot of price (x, log scale) for each product category (y).
plt.figure(figsize=(15,30))
# x/y are passed by keyword: seaborn 0.12+ accepts only `data` positionally,
# so the positional form raises a TypeError there.
sns.swarmplot(x=Products_data['price'],y=Products_data['Product_Category'])
plt.xscale('log')

**Appartements** and **Maisons et villas** are the most expensive product categories. The **Voitures** category seems to have the largest number of expensive items. Many of the product categories contain **outliers** because of the variation in prices within the different product categories.

In [None]:
# Swarm plot of price (y, log scale) for each publication type (x).
plt.figure(figsize=(20,10))
# x/y are passed by keyword: seaborn 0.12+ accepts only `data` positionally,
# so the positional form raises a TypeError there.
sns.swarmplot(x=Products_data['Professional_Publication'],y=Products_data['price'])
plt.yscale('log')

The **private publications** seem to have more items at both the most expensive and the cheapest ends of the price range, and they also contain more **outliers** than the **pro publications**.

In [None]:
# Box plot of price (x, log scale) for each product category (y).
plt.figure(figsize=(15,30))
# x/y are passed by keyword: seaborn 0.12+ accepts only `data` positionally,
# so the positional form raises a TypeError there.
sns.boxplot(x=Products_data['price'],y=Products_data['Product_Category'])
plt.xscale('log')

Most of the product categories contain **outliers** because of the wide variation in the prices of the products.

In [None]:
# Box plot of price (y, log scale) for each publication type (x).
plt.figure(figsize=(20,10))
# x/y are passed by keyword: seaborn 0.12+ accepts only `data` positionally,
# so the positional form raises a TypeError there.
sns.boxplot(x=Products_data['Professional_Publication'],y=Products_data['price'])
plt.yscale('log')

The costliest products sold come from **private publications** rather than **pro publications**. More **outliers** are present in **private publications**.

In [None]:
# Number of rows for each distinct Professional_Publication value.
Products_data['Professional_Publication'].value_counts()

In [None]:
# Bar chart of the number of rows per publication type.
# The series is passed as x= explicitly: in seaborn 0.12+ the only positional
# parameter is `data`, so a positional series is no longer interpreted as x.
sns.countplot(x=Products_data['Professional_Publication'])

More **private publication** products are sold than **pro publication** products.

In [None]:
# Number of rows for each distinct Product_Category value.
Products_data['Product_Category'].value_counts()

In [None]:
# Horizontal count plot: rows per product category, split by publication type.
plt.figure(figsize=(10, 20))
sns.countplot(
    data=Products_data,
    y='Product_Category',
    hue='Professional_Publication',
)

Among **private publications**, the **Voitures** category is sold the most; among **pro publications**, the **Ordinateurs de bureau** category is sold the most. Overall, **Voitures** products are sold the most, and **Films, livres, magazines** products are sold the least.

In [None]:
# Number of rows for each distinct Region_address value.
Products_data['Region_address'].value_counts()

In [None]:
# Horizontal count plot: rows per region, split by publication type.
plt.figure(figsize=(10, 10))
sns.countplot(
    data=Products_data,
    y='Region_address',
    hue='Professional_Publication',
)

In [None]:
# Distribution of price: histogram with a KDE curve, a rug, and a fitted
# scipy.stats.norm curve (fit=norm).
# NOTE(review): distplot is reported as deprecated in newer seaborn releases —
# confirm the installed version still provides it before relying on this cell.
sns.distplot(Products_data['price'],kde=True,fit=norm,rug=True)

In [None]:
# Report the smallest and largest price in the dataset.
price_column = Products_data['price']
print('Minimum Price:', price_column.min())
print('Maximum Price:', price_column.max())

In [None]:
# Box-Cox is applied to the strictly positive prices only.
i = Products_data['price'] > 0
Original = Products_data.loc[i, 'price']

# stats.boxcox returns a tuple; its first element holds the transformed values.
boxcox_result = stats.boxcox(Original)
Normalized = pd.Series(boxcox_result[0], name='price', index=Original.index)

# Plot the transformed prices with a rug and a fitted normal curve.
sns.distplot(Normalized, rug=True, fit=norm)
plt.title('Normalized data')

The price distribution is approximately normal but slightly **right-skewed**.

In [None]:
# Horizontal count plot with one bar per distinct price value.
plt.figure(figsize=(25, 100))
sns.countplot(data=Products_data, y='price')

Demand is highest for products priced at 300, and most of the products sold are priced in the range 100 to 10000. There is less demand for high-priced products.

In [None]:
# Horizontal count plot: rows per local address, split by publication type.
plt.figure(figsize=(15, 30))
sns.countplot(
    data=Products_data,
    y='Local_address',
    hue='Professional_Publication',
)

In [None]:
# Total price per region, ordered from the largest total to the smallest.
region_price_totals = (
    Products_data.groupby(['Region_address'])['price']
    .sum()
    .reset_index()
    .sort_values(by=['price'], ascending=False)
)

plt.figure(figsize=(10, 8))
sns.barplot(data=region_price_totals, x='price', y='Region_address')

Both **pro** and **private** products are sold mostly by sellers in **Grand Casablanca**. The costliest products are also sold by sellers in **Grand Casablanca**. The fewest products are sold by sellers in the **Laâyoune-Boujdour-Sakia El Hamra** region.