In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# NOTE(review): `df` is not assigned in any cell visible in this notebook —
# presumably a data-loading cell (e.g. a pd.read_csv call) is missing above.
# Confirm and restore it so the notebook survives Restart Kernel -> Run All.
# Displays the (rows, columns) tuple of the DataFrame.
df.shape

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.isnull().sum()

In [None]:
# Show the distinct values held by each of the listed columns,
# one line per column, with the column name upper-cased.
list_col = ['Product', 'MaritalStatus', 'Usage', 'Fitness', 'Education', 'Age']
for col in list_col:
    distinct_values = df[col].unique()
    print(f'{col.upper()} :{distinct_values} ')

In [None]:
df.Product.value_counts()

In [None]:
#Are Male customers buying treadmill more than female customers?
df.Gender.value_counts()

In [None]:
#Are married customer buying Treadmill more than Single customers?
df.MaritalStatus.value_counts()

Observation: There are 107 Partnered and 73 Single customers. Partnered customers buy more treadmills than Single customers.

In [None]:
df['Usage'].value_counts()

In [None]:
df['Fitness'].value_counts()

A broader look at correlation between columns of dataframe

In [None]:
# Integer codes for the categorical columns so that they can take part in the
# correlation matrix computed on `df_copy`.
CATEGORY_CODES = {
    'Gender': {'Male': 1, 'Female': 0},
    'MaritalStatus': {'Single': 0, 'Partnered': 1},
    'Product': {'KP281': 0, 'KP481': 1, 'KP781': 2},
}

# Build the encoded frame as a new object; `df` itself is left untouched.
# The previous `df_copy[col].replace(..., inplace=True)` form is chained
# assignment: pandas 2.x warns about it and, with Copy-on-Write enabled, it
# silently leaves `df_copy` unmodified. `Series.map` returns a new integer
# column instead. Any value not listed in CATEGORY_CODES becomes NaN, which
# makes an unexpected category visible rather than leaving a stray string.
df_copy = df.assign(**{
    column: df[column].map(codes)
    for column, codes in CATEGORY_CODES.items()
})


In [None]:
# Heatmap of the pairwise correlations of the encoded frame, with each
# coefficient annotated in its cell.
fig, ax = plt.subplots(figsize=(15, 6))
corr_matrix = df_copy.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
plt.show()

Observations:
1. Treadmill purchased has high correlation with Education, income, usage, fitness and miles.
2. Age is highly correlated to income which definitely seems reasonable. It is also correlated with Education and Marital Status which stands completely alright.
3. Gender has some correlation to usage, income, fitness and miles.
4. Education is correlated to age and miles. It is highly correlated to income. It is sufficiently correlated to usage and fitness.
5. Marital Status has some correlation to income and age.
6. Usage is extremely correlated to fitness and miles and has a higher correlation to income as well.
7. Fitness has a great correlation to income.

More Observations and Possibilities:
1. Product, fitness, usage and miles depict higher correlation among themselves which is expected.
2. Age and education are indicators of income which affects the products bought. The more advanced the product, the more is the usage and miles resulting in more fitness.

# Univariate Analysis

### Age

In [None]:
sns.kdeplot(x='Age',data=df)
plt.show()

In [None]:
sns.boxplot(y='Age',data=df)
plt.show()

The above two plots show that we have ages ranging from 18 to 50. Most frequent age is 25.

### Gender

In [None]:
df['Gender'].value_counts()

In [None]:
sns.countplot(x='Gender',data=df)
plt.show()

In [None]:
# Number of customers of each gender, in the order the slices are labelled.
labels = ['Male', 'Female']
n_Male, n_Female = (int((df['Gender'] == gender).sum()) for gender in labels)
data = [n_Male, n_Female]

# autopct prints each slice's share as a whole-number percentage.
plt.pie(data, labels=labels, autopct='%.0f%%')
plt.show()

### Education 

In [None]:
sns.boxplot(y='Education',data=df)
plt.show()

In [None]:
df['Education'].unique()

In [None]:
df['Education'].value_counts()

In [None]:
sns.histplot(x='Education',data=df)
plt.show()

Customers with 20 or more years of education are outliers.
Most of our data lies between 14 and 16 years of education.

### Marital Status

In [None]:
df['MaritalStatus'].value_counts()

In [None]:
sns.countplot(x='MaritalStatus',data=df)
plt.show()

In [None]:
# Number of customers in each marital-status category, in label order.
# `n_Married` holds the count of rows whose status is 'Partnered'.
labels = ['Single', 'Partnered']
n_Single, n_Married = (int((df['MaritalStatus'] == status).sum()) for status in labels)
data = [n_Single, n_Married]

# autopct prints each slice's share as a whole-number percentage.
plt.pie(data, labels=labels, autopct='%.0f%%')
plt.show()

### Usage

In [None]:
df['Usage'].unique()

In [None]:
sns.histplot(x='Usage',bins=30,data=df)
plt.show()

In [None]:
sns.boxplot(y='Usage',data=df)
plt.show()

Most people use treadmills 3 to 4 times a week.
6 to 7 times of usage per week is rare.

### Fitness

In [None]:
sns.histplot(x='Fitness',bins=30,data=df)
plt.show()

In [None]:
sns.boxplot(y='Fitness',data=df)
plt.show()

Most people have an average body shape. Very few people have a poor body shape.

### Income

In [None]:
sns.kdeplot(x='Income',data=df)
plt.show()

In [None]:
sns.boxplot(y='Income',data=df)
plt.show()

People mostly earn between 40k and 60k.
Few people earn more than 80k.

### Miles

In [None]:
sns.kdeplot(x='Miles',data=df)
plt.show()

In [None]:
sns.boxplot(y='Miles',data=df)
plt.show()

People mostly cover 60 to 120 miles per week.
175 miles or more are rare.

# Marginal Probability

In [None]:
# Number of purchases of each treadmill model, in label order.
labels = ['KP281', 'KP481', 'KP781']
n_KP281, n_KP481, n_KP781 = (int((df['Product'] == model).sum()) for model in labels)
data = [n_KP281, n_KP481, n_KP781]

# autopct prints each slice's share as a percentage with two decimals.
plt.pie(data, labels=labels, autopct='%.2f%%')
plt.show()

# KP281 is most bought product overall
# This is Marginal Probability:
    # 44% customers bought KP281
    # 33% customers bought KP481
    # 22% customers bought KP781

Marginal Probabilities of customers buying KP281,KP481,KP781 are 44%, 33%, 22% respectively.

Recommendations:
1. If a random customer comes in, chances of buying KP281 is the highest.
2. We should suggest KP281 to a customer.

# Bivariate Analysis

### Age and Product

In [None]:
sns.boxplot(x='Product',y='Age',data=df)
plt.show()

In [None]:
df['Age'].unique()

As there are many unique values of ages, it is better to divide these into different ranges to get better insights.

In [None]:
# pd.cut builds right-closed intervals by default, so these edges produce
# (0, 25], (25, 35], (35, 45] and (45, 55]. The labels now describe exactly
# those intervals; the previous 'Below 25' / '25-35' / ... labels put a
# 25-year-old under 'Below 25' and a 35-year-old under '25-35'.
# The grouping itself is unchanged, only the label text is corrected.
# Labels assume ages are whole years — TODO confirm. Ages above 55 fall
# outside the last edge and would become NaN.
bins = [0, 25, 35, 45, 55]
group = ['25 and below', '26-35', '36-45', '46 and above']

df['Age group'] = pd.cut(df['Age'], bins, labels=group)


In [None]:
sns.countplot(x='Age group',hue='Product',data=df)
plt.show()

### Marginal Probabilities of age groups and products 

In [None]:
pd.crosstab(df["Age group"], df["Product"], normalize=True,margins=True)*100

### Conditional Probability of Product given Age group

In [None]:
pd.crosstab(df["Age group"], df["Product"], normalize='index')*100

Recommendation:
1. We can see that in the age group of below 35 years, KP281 is the most popular.
2. We should recommend KP281 to those below 35 years.
3. Second popular is KP481, then KP781.

### Gender and Product

In [None]:
# Share of customers per gender, as a percentage of all rows, rounded to 2 decimals.
df['Gender'].value_counts().div(len(df)).mul(100).round(2)

In [None]:
sns.countplot(x='Gender',hue='Product',data=df)
plt.show()

We can see that males purchase KP781 considerably more often than females do.

Recommendations:
1. Recommend KP781 to males, as they have a higher probability of buying it than females.
2. KP281, in contrast, has a higher buying probability among females than among males.

Conditional Probability:

In [None]:
pd.crosstab(df["Gender"], df["Product"], margins=True,normalize='index')*100

Marginal Probability:

In [None]:
pd.crosstab(index=df['Gender'],columns=df['Product'],margins=True)

In [None]:
pd.crosstab(df["Gender"], df["Product"],margins=True,normalize=True)*100

### Marital Status and Product

In [None]:
# Share of customers per marital status, as a percentage of all rows, rounded to 2 decimals.
df['MaritalStatus'].value_counts().div(len(df)).mul(100).round(2)

In [None]:
sns.countplot(x='MaritalStatus',hue='Product',data=df)
plt.xlabel('Marital Status')
plt.show()

# Married people prefer all the three products more than single people
# We can focus on married people

### Conditional Probability of Product given Marital Status

In [None]:
# P(Product | MaritalStatus): each row is normalised to sum to 1 and then
# scaled to percent, matching the other conditional-probability tables in
# this notebook (this one was previously shown as raw fractions).
pd.crosstab(df["MaritalStatus"], df["Product"], normalize='index')*100

### Marginal Probabilities of Marital Status and Product

In [None]:
pd.crosstab(df["MaritalStatus"], df["Product"],margins=True ,normalize=True)*100

There is no significant difference in married and single people in all the three categories of treadmills. 
But KP281 is the most popular in both single and married people

### Income and Product

In [None]:
sns.boxplot(y='Income',x='Product',data=df)
plt.show()

In [None]:
# Bucket incomes into four labelled brackets and store them in a new column.
# The edges are passed to pd.cut with its default settings.
group = ['Below 40k', '40k-60k', '60k-80k', 'Above 80k']
bins = [0, 40000, 60000, 80000, 120000]

income_brackets = pd.cut(df['Income'], bins=bins, labels=group)
df['Income bins'] = income_brackets

In [None]:
sns.countplot(x='Income bins',hue='Product',data=df)
plt.show()

### Marginal Probabilities of Product and Income

In [None]:
pd.crosstab(df["Income bins"], df["Product"], normalize=True,margins=True)*100

### Conditional Probability of Product given Income

In [None]:
pd.crosstab(df["Income bins"], df["Product"], normalize='index')*100

As we can observe from the above charts and conditional probability table that,
1. People whose income is below 40k prefer KP281 the most.
2. People who earn more than 80k prefer KP781.
3. People whose income ranges from 40k to 80k prefer KP281 and KP481; there is not much difference in the probabilities.

### Product and Fitness

In [None]:
sns.barplot(x='Product',y='Fitness',data=df)
plt.show()

In [None]:
sns.countplot(x='Fitness',hue='Product',data=df)
plt.show()

### Marginal Probabilities of Product and Fitness

In [None]:
pd.crosstab(df["Fitness"], df["Product"], normalize=True,margins=True)*100

### Conditional Probability of Product given Fitness

In [None]:
pd.crosstab(df["Fitness"], df["Product"], normalize='index')*100

Observations:
1. KP781 is the most popular in those with excellent body shape.
2. KP281 is the most popular in those who have average body shape (3).

Recommendations:
1. We can recommend KP781 to those with excellent body shape and KP281 to those with average body shape.

### Usage and Product

In [None]:
sns.barplot(x='Product',y='Usage',data=df)
plt.show()

In [None]:
sns.countplot(x='Usage',hue='Product',data=df)
plt.show()

### Marginal Probabilities of Product and Usage

In [None]:
pd.crosstab(df["Usage"], df["Product"], normalize=True,margins=True)*100

### Conditional Probability of Product given Usage

In [None]:
# P(Product | Usage): each row is normalised to sum to 1 and then scaled to
# percent, matching the other conditional-probability tables in this
# notebook (this one was previously shown as raw fractions).
pd.crosstab(df["Usage"], df["Product"], normalize='index')*100

Observations:
1. Those who use treadmills 5 times a week or more prefer KP781.
2. Those who use treadmills 2 to 4 times a week prefer KP281.

### Miles and Product

In [None]:
sns.boxplot(y='Miles',x='Product',data=df)
plt.show()

Miles and usage of treadmills are positively related to each other. A person who uses a treadmill more times per week will
naturally run more miles than others.

### Education and Product

In [None]:
# pd.cut builds right-closed intervals by default, so these edges produce
# (0, 14], (14, 18] and (18, 22]. The labels now describe exactly those
# intervals; the previous 'Less than 14' label also contained customers with
# exactly 14 years of education. The grouping itself is unchanged.
# Labels assume education is recorded in whole years — TODO confirm.
bins = [0, 14, 18, 22]
group = ['14 or less', '15-18', 'More than 18']

df['Years of Education'] = pd.cut(df['Education'], bins, labels=group)

In [None]:
sns.countplot(x='Years of Education',hue='Product',data=df)
plt.show()

In [None]:
sns.boxplot(x='Product',y='Education',data=df)
plt.show()

### Marginal Probabilities of Product and Education

In [None]:
pd.crosstab(df["Years of Education"], df["Product"], normalize=True,margins=True)*100

### Conditional Probability of Product given Education

In [None]:
pd.crosstab(df["Years of Education"], df["Product"], normalize='index')*100

Observations:
1. KP781 is more popular in highly educated people (18 years or more).
2. KP281 and KP481 are popular in moderately educated people (less than 18 years).

# Multi variate analysis

### Effect of age and income on product

In [None]:
# Income against age, one point per customer, coloured by the product bought.
fig, ax = plt.subplots()
sns.scatterplot(data=df, x='Age', y='Income', hue='Product', ax=ax)
plt.show()
# Author's reading of the plot: KP781 looks like the most expensive model (very niche).

The variance of income is smaller at lower ages than at higher ages; this is called heteroscedasticity.

# Plotting of Probabilities of each factor affecting the product

In [None]:
# For every factor, draw a grouped bar chart of the product mix within each
# level of that factor (rows of the crosstab are normalised to sum to 1).
cat_cols = ['Gender','MaritalStatus','Usage','Fitness','Age group','Income bins','Years of Education']
for factor in cat_cols:
    product_share = pd.crosstab(df[factor], df["Product"], normalize='index')
    ax = product_share.plot(kind='bar', figsize=(10, 4))
    ax.tick_params(axis='x', labelrotation=0)  # keep the category labels horizontal
    ax.set_ylabel('Proportion')

Observations:

1. Around 55% of women prefer KP281 and only 10% prefer KP781. While around 35% of men prefer KP781.
2. People having education of more than 18 years prefer KP781. While those with less than 14 years of education prefer KP281.
3. There is no significant difference in married and single people in all the three categories of treadmills. But KP281 is the most popular in both single and married people.
4. Those who use the treadmill 5 times or more prefer KP781, while those who use it less than 4 times prefer KP281.
5. 95% of customers having fitness levels of 5 use KP781 and none of those having fitness level less than 3 use KP781.
6. People who earn less than 40k prefer KP281 the most. While those who earn more than 80k prefer KP781.