# Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# Loading the data

In [None]:
# Read the Windows Store listing from the Kaggle input directory
raw_data = pd.read_csv(filepath_or_buffer='/kaggle/input/windows-store/msft.csv')

In [None]:
# Peek at the first ten rows
raw_data.head(n=10)

In [None]:
# Peek at the last ten rows
raw_data.tail(n=10)

In [None]:
# Summary statistics for every column (include='all' also covers non-numeric ones)
raw_data.describe(include='all')

## Checking Missing values

In [None]:
# Number of missing entries in each column
raw_data.isna().sum()

In [None]:
# dropna() returns a frame derived from raw_data; take an explicit copy so the
# column assignments made on `data` later in the notebook modify an
# independent DataFrame instead of triggering SettingWithCopyWarning.
data = raw_data.dropna().copy()

In [None]:
# Summary statistics again, now on the frame with missing rows removed
data.describe(include='all')

# Let us first check each column one by one

## Rating column

### This will tell us how satisfied the customers are
##### Given Microsoft's reputation, we can assume that ratings will be high for most products

In [None]:
# How many products received each rating value
data['Rating'].value_counts()

In [None]:
# Bar chart of the number of products per rating value
ae = sns.countplot(data=data, x='Rating')

## From this graph we can easily conclude that our hypothesis is true, as most of the ratings are in the 4-5 range

# Category column

## This analysis will show us which types of products are in the store

In [None]:
# How many products fall into each category
data['Category'].value_counts()

In [None]:
# Extra-wide canvas for the per-category product counts
plt.figure(figsize=(35, 5))
sns.countplot(data=data, x='Category')

# Price column

In [None]:
# Frequency of each distinct value in the Price column
data['Price'].value_counts()

In [None]:
# Summary of the Price column before it is converted to numbers
data['Price'].describe()

## Let's convert this into a numerical value

In [None]:
### Remove the thousands separator and the rupee sign from the price text
price_without_commas = data['Price'].str.replace(',', '')
data['Price'] = price_without_commas.str.replace('₹', '')

In [None]:
# Check the cleaned price text on the last five rows
data['Price'].tail(n=5)

In [None]:
### Map the literal 'Free' to 0
def con(x):
    """Return 0 when *x* equals 'Free'; otherwise return *x* unchanged."""
    return 0 if x == 'Free' else x

In [None]:
# Hand the converter to apply directly; no wrapping lambda is needed
data['Price'] = data['Price'].apply(con)

In [None]:
# Parse the price text as numbers; errors='coerce' turns anything unparseable into NaN
data['Price']=pd.to_numeric(data['Price'],errors='coerce')

In [None]:
# pd.to_numeric(..., errors='coerce') leaves NaN for any price it could not
# parse, and casting NaN to int raises. Drop such rows explicitly (reporting
# how many) instead of crashing; .copy() keeps `data` an independent frame.
bad_price = data['Price'].isna()
if bad_price.any():
    print('Dropping', int(bad_price.sum()), 'rows with unparseable Price')
data = data.loc[~bad_price].copy()
# Note: astype(int) truncates any fractional part of the price.
data['Price'] = data['Price'].astype(int)

In [None]:
# Confirm the last five prices are now integers
data['Price'].tail(n=5)

### Let's also create a price band

In [None]:
# Bin edges for pd.cut (right-inclusive intervals): (-2, 0] captures free
# products, and the last edge is infinity so the '>5000' band really has no
# upper limit. With a finite 10000 edge, any price above 10000 would
# silently get a NaN band.
r = [-2, 0, 100, 200, 500, 1000, 2000, 5000, np.inf]
# One label per interval, so len(g) == len(r) - 1
g = ['free', '0-100', '100-200', '200-500', '500-1000', '1000-2000', '2000-5000', '>5000']
data['price_band'] = pd.cut(data['Price'], bins=r, labels=g)

In [None]:
# Number of products in each price band
data['price_band'].value_counts()

### We already know the count of free products
### Now we will check the prices greater than zero

In [None]:
# Keep only the paid products (price strictly above zero)
paid_mask = data['Price'] > 0
price_non_Zero = data.loc[paid_mask]
price_non_Zero.head()

In [None]:
plt.figure(figsize=(14, 5))
# sns.distplot is deprecated; histplot is its replacement for a plain
# histogram (kde=False keeps the density curve off, as before).
sns.histplot(price_non_Zero['Price'], kde=False)

# Now we will compare the columns and come up with useful insights

## Rating vs

### 1-> Rating vs No of people Rated

In [None]:
# Summary statistics of 'No of people Rated' within each rating value
data.groupby('Rating')['No of people Rated'].describe()

In [None]:
# Box plot of the number of raters for each rating value
plt.figure(figsize=(10, 5))
sns.boxplot(data=data, x='Rating', y='No of people Rated')


###  2-> Rating vs Category

In [None]:
# Distinct category names present in the data
data['Category'].unique()

In [None]:
# Bar height is the mean Rating of each Category, hence the axis label
plt.figure(figsize=(30, 5))
sns.barplot(data=data, x='Category', y='Rating')
plt.ylabel("Average rating")

In [None]:
plt.figure(figsize=(34, 5))
sns.violinplot(y='Rating', x='Category', data=data)
# A violin plot shows the whole rating distribution per category, not a mean,
# so the axis is labelled "Rating" rather than "Average Rating".
plt.ylabel("Rating")

##  3-> Rating vs price

In [None]:
# Mean Rating within each price band
plt.figure(figsize=(30, 5))
sns.barplot(data=data, x='price_band', y='Rating')
plt.ylabel("Average Rating")

In [None]:
# Rating distribution within each price band
plt.figure(figsize=(30, 5))
sns.violinplot(data=data, x='price_band', y='Rating')

# Category vs

## 1-> Category vs No of people Rated

In [None]:
# Mean number of raters for each category
plt.figure(figsize=(40, 5))
sns.barplot(data=data, x='Category', y='No of people Rated')
plt.ylabel("Average No of people rated")

In [None]:
# Distribution of the number of raters within each category
plt.figure(figsize=(40, 5))
sns.violinplot(data=data, x='Category', y='No of people Rated')

## 2->  Category vs price

In [None]:
# One point per product; jitter spreads overlapping prices sideways
plt.figure(figsize=(30, 5))
sns.stripplot(data=data, x='Category', y='Price', jitter=True)

In [None]:
# Count free (True) versus paid (False) products, split by category
plt.figure(figsize=(20, 5))
sns.countplot(hue=data['Category'], x=data['Price'].eq(0))