# 1- Preprocessing

## Open the file

In [4]:
import pandas as pd
import numpy as np

## Examine file

In [5]:
df = pd.read_csv('hw_data.csv')
df

Unnamed: 0.1,Unnamed: 0,Age,Gender,Education Level,Income,Marital Status,Employment Status,Product Category,Satisfaction Level,Debt Status
0,0,500,Female,,31158.0,,,,9,1
1,1,73,Female,College,54465.0,,0.0,E,5,1
2,2,68,Male,High School,37427.0,married,0.0,D,1,1
3,3,61,Female,College,52210.0,widowed,0.0,G,8,1
4,4,34,Female,College,59325.0,separated,1.0,A,5,0
...,...,...,...,...,...,...,...,...,...,...
105,56,85,Female,University,39063.0,divorced,,F,8,0
106,0,500,Female,,31158.0,,,,9,0
107,73,39,Female,College,55750.0,married,1.0,F,5,1
108,28,80,Female,High School,32142.0,separated,,C,7,1


## Define missing values

In [6]:
df.isnull().sum()

Unnamed: 0             0
Age                    0
Gender                 0
Education Level       17
Income                12
Marital Status        19
Employment Status     35
Product Category      13
Satisfaction Level     0
Debt Status            0
dtype: int64

## Decide about missing values (fill or drop)

In [7]:
# Education Level => Fill (group by income median)
# Income => Fill  (group by education level median)
df['Marital Status'] = df['Marital Status'].fillna(df.groupby("Age")["Marital Status"].transform(lambda x: x.mode().iloc[0] if not x.mode().empty else None))
df['Employment Status'].fillna(df.groupby(["Age", 'Education Level'])["Marital Status"].transform(lambda x: x.mode().iloc[0] if not x.mode().empty else None))
df['Education Level'].fillna(df.groupby("Income")["Education Level"].transform(lambda x: x.mode().iloc[0] if not x.mode().empty else None))
df['Income'].fillna(df.groupby("Education Level")["Income"].transform('median'))
df['Product Category'].fillna(df['Product Category'].mode()[0], inplace=True)






## Why did you fill missing values or if you have dropped, why did you do so? 

In [8]:
# Fill all the missing value because missing values is 10% - 30% of data, which is not good to lost this data

## Find outliers if there are

In [9]:
def outlier_thresholds(dataframe, col_name, q1=0.25, q3=0.75):
    quartile1 = dataframe[col_name].quantile(q1)
    quartile3 = dataframe[col_name].quantile(q3)
    interquantile_range = quartile3 - quartile1
    up_limit = quartile3 + 1.5 * interquantile_range
    low_limit = quartile1 - 1.5 * interquantile_range
    return low_limit, up_limit

def check_outlier(dataframe, col_name):
    low_limit , up_limit = outlier_thresholds(dataframe, col_name)
    if dataframe[(dataframe[col_name] > up_limit) | (dataframe[col_name] < low_limit)].any(axis=None):
        return True
    else:
        return False
    
numeric_cols = df.select_dtypes(include=['number']).columns
for i in numeric_cols:
    print(i, check_outlier(df, i))
    

def replace_with_thresholds(dataframe, variable):
    low_limit, up_limit = outlier_thresholds(dataframe, variable)
    dataframe.loc[(dataframe[variable] < low_limit), variable] = low_limit
    dataframe.loc[(dataframe[variable]> up_limit), variable] = up_limit

Unnamed: 0 False
Age True
Income True
Employment Status False
Satisfaction Level False
Debt Status False


## Decide what will you do with outliers

In [10]:
numeric_cols = df.select_dtypes(include=['number']).columns
for i in numeric_cols:
    replace_with_thresholds(df, i)

  dataframe.loc[(dataframe[variable] < low_limit), variable] = low_limit
  dataframe.loc[(dataframe[variable] < low_limit), variable] = low_limit
  dataframe.loc[(dataframe[variable] < low_limit), variable] = low_limit


## Find duplicates, how many are there? Decide what will you do about that?

In [11]:
duplicate_rows = df[df.duplicated()]
duplicate_rows

Unnamed: 0.1,Unnamed: 0,Age,Gender,Education Level,Income,Marital Status,Employment Status,Product Category,Satisfaction Level,Debt Status
100,29,39.0,Male,High School,,married,1.0,B,4.0,0.0
101,86,31.0,Female,High School,79096.75,separated,1.0,G,4.0,1.0
103,59,90.0,Male,College,79096.75,widowed,0.0,C,6.0,1.0
109,30,164.625,Female,High School,18442.0,separated,1.0,A,3.0,0.0


In [12]:
df.drop_duplicates(inplace=True)


## Explain why did you do that?

In [13]:
# To create a data with unique information in each row

duplicate_rows = df[df.duplicated()]
duplicate_rows


Unnamed: 0.1,Unnamed: 0,Age,Gender,Education Level,Income,Marital Status,Employment Status,Product Category,Satisfaction Level,Debt Status


# 2- Categorization

## Categorize satisfaction in 3 level and label them as 'Low', 'Moderate' and 'High'. Add a new column named 'Satisfaction Category' and store them there

#### ||| Low Satisfaction : (1-3) ||| Moderate Satisfaction : (4-7) ||| High Satisfaction : (8-10) |||

In [14]:
def categorize_satisfaction(level):
    if 1 <= level <= 3:
        return 'Low'
    elif 4 <= level <= 7:
        return 'Moderate'
    else:
        return 'High'


df['Satisfaction Category'] = df['Satisfaction Level'].apply(categorize_satisfaction)

In [15]:
df

Unnamed: 0.1,Unnamed: 0,Age,Gender,Education Level,Income,Marital Status,Employment Status,Product Category,Satisfaction Level,Debt Status,Satisfaction Category
0,0,164.625,Female,,31158.0,separated,,G,9.0,1.0,High
1,1,73.000,Female,College,54465.0,,0.0,E,5.0,1.0,Moderate
2,2,68.000,Male,High School,37427.0,married,0.0,D,1.0,1.0,Low
3,3,61.000,Female,College,52210.0,widowed,0.0,G,8.0,1.0,High
4,4,34.000,Female,College,59325.0,separated,1.0,A,5.0,0.0,Moderate
...,...,...,...,...,...,...,...,...,...,...,...
104,84,63.000,Male,High School,45571.0,married,,D,0.0,0.0,High
105,56,85.000,Female,University,39063.0,divorced,,F,8.0,0.0,High
106,0,164.625,Female,,31158.0,separated,,G,9.0,0.0,High
107,73,39.000,Female,College,55750.0,married,1.0,F,5.0,1.0,Moderate


## Categorize income in 3 level and label them as 'Low', 'Moderate' and 'High'. Add a new column named 'Income Category' and store them there

In [16]:

quantiles = [0, 0.33, 0.66, 1]


labels = ['Low', 'Moderate', 'High']


df['Income Category'] = pd.qcut(df['Income'], q=quantiles, labels=labels)

df

Unnamed: 0.1,Unnamed: 0,Age,Gender,Education Level,Income,Marital Status,Employment Status,Product Category,Satisfaction Level,Debt Status,Satisfaction Category,Income Category
0,0,164.625,Female,,31158.0,separated,,G,9.0,1.0,High,Low
1,1,73.000,Female,College,54465.0,,0.0,E,5.0,1.0,Moderate,High
2,2,68.000,Male,High School,37427.0,married,0.0,D,1.0,1.0,Low,Moderate
3,3,61.000,Female,College,52210.0,widowed,0.0,G,8.0,1.0,High,High
4,4,34.000,Female,College,59325.0,separated,1.0,A,5.0,0.0,Moderate,High
...,...,...,...,...,...,...,...,...,...,...,...,...
104,84,63.000,Male,High School,45571.0,married,,D,0.0,0.0,High,High
105,56,85.000,Female,University,39063.0,divorced,,F,8.0,0.0,High,Moderate
106,0,164.625,Female,,31158.0,separated,,G,9.0,0.0,High,Low
107,73,39.000,Female,College,55750.0,married,1.0,F,5.0,1.0,Moderate,High


# 3- Analyse

## Comparing using pivot tables

### Is there a relation between marital status and debt status?

In [17]:
marital_debt_relation = pd.crosstab(index=df['Marital Status'], columns=df['Debt Status'], normalize='index')
marital_debt_relation

Debt Status,0.0,1.0
Marital Status,Unnamed: 1_level_1,Unnamed: 2_level_1
divorced,0.428571,0.571429
married,0.518519,0.481481
never married,0.416667,0.583333
separated,0.592593,0.407407
widowed,0.666667,0.333333


### Is there a relation between income category and debt status?

In [18]:
income_debt_relation = pd.crosstab(index=df['Income'], columns=df['Debt Status'], normalize='index')
income_debt_relation

Debt Status,0.0,1.0
Income,Unnamed: 1_level_1,Unnamed: 2_level_1
6740.00,0.0,1.0
7010.00,1.0,0.0
10845.00,0.0,1.0
14106.00,0.0,1.0
16647.00,0.0,1.0
...,...,...
59325.00,1.0,0.0
60010.00,0.0,1.0
64165.00,0.0,1.0
64730.00,0.5,0.5


### Is there a relation between education level and satisfaction category?

In [19]:
education_satisfaction_relation = pd.crosstab(index=df['Education Level'], columns=df['Satisfaction Category'], normalize='index')
education_satisfaction_relation

Satisfaction Category,High,Low,Moderate
Education Level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
College,0.333333,0.222222,0.444444
Elementary,0.125,0.625,0.25
High School,0.28,0.28,0.44
University,0.366667,0.166667,0.466667


### Is there a relation between employment status and satisfaction category?

In [20]:
employment_satisfaction_relation = pd.crosstab(index=df['Employment Status'], columns=df['Satisfaction Category'], normalize='index')
employment_satisfaction_relation

Satisfaction Category,High,Low,Moderate
Employment Status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,0.290323,0.290323,0.419355
1.0,0.175,0.325,0.5


# 4- General Conclusion

| Marital Status | Debt Status 1 (%) | Debt Status 0 (%) |
|----------------|-------------------|-------------------|
| Divorced       | 58.33             | 41.67             |
| Married        | 47.83             | 52.17             |
| Never Married  | 60.0              | 40.0              |
| Separated      | 43.9              | 56.1              |
| Widowed        | 35.0              | 65.0              |


---

### Checklist

- [x]  start homework;
- [ ]  file open;
- [ ]  file examined;
- [ ]  missing values defined;
- [ ]  missing values are filled;
- [ ]  explanation for the possible causes of missing values;
- [ ]  an explanation of how the blanks are filled;
- [ ]  an explanation of how the blanks are filled;
- [ ]  finding outliers;
- [ ]  handling outliers;
- [ ]  duplicates showed;
- [ ]  duplicates deleted;
- [ ]  an explanation of which method is used to find and remove duplicates;
- [ ]  data is categorized;
- [ ]  an answer to the question " Is there a relation betweenmarital status and debt status?";
- [ ]  an answer to the question " Is there a relation between income category and debt status?";
- [ ]  an answer to the question " Is there a relation between education level and satisfaction category?";
- [ ]  an answer to the question " Is there a relation between employment status and satisfaction category?"
- [ ]  conclusions are present on each stage;
- [ ]  a general conclusion is made.

---