# Data Analytics Coursework
**Oliwier Kulczycki**

# Setup
I personally suggest using a Python virtual env (venv).
Requirements.txt is provided, or you can see below for imports.

In [None]:
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as stats

In [None]:
# Load the cleaned credit dataset. The path is relative, so the CSV is looked up
# in the current working directory.
path_full = "credits-clean.csv"
df = pd.read_csv(path_full)

# Width of a single bar in the grouped (good/bad) bar charts plotted further down.
bar_width = 0.25

Show the original dataframe for reference.

In [None]:
# Bare expression on the last line of the cell: the notebook renders the DataFrame.
df

***

# Dataset Versions
*(Nominal and Numeric Conversions)*

## Converting to numeric
For this task, I decided to use a method which enumerates every unique value within each column and assigns it an integer.

In [None]:
# Work on an independent copy created with df.copy().
# A plain `df_numeric = df` would only make a second name for the same DataFrame,
# so the numeric conversion would also overwrite the original `df` that the rest
# of the notebook still relies on.
df_numeric = df.copy()

In [None]:
# Columns that already hold numbers and must keep their original values.
already_numeric = ('Case_no', 'credit_amount', 'age')

# Replace every other column with the integer codes of its categories.
for col in df_numeric.columns:
    if col not in already_numeric:
        df_numeric[col] = df_numeric[col].astype('category').cat.codes

# Show the work done.
df_numeric

In [None]:
# Print, for every converted column, which integer code stands for which original value.
# The categories are rebuilt from the untouched `df`, the same way the conversion derives them.
for col in df_numeric.columns:
    if col in ('Case_no', 'credit_amount', 'age'):
        # Already numeric: nothing was encoded, so there is no mapping to show.
        continue
    categories = df[col].astype('category').cat.categories
    mapping = dict(enumerate(categories))
    print(f"{col} mapping:", mapping)

Creates the numeric version of the .csv file.

In [None]:
# Write the numeric dataset to disk twice:
#  - index=True also writes the DataFrame's row index as an extra leading column
#    (kept only as the "bad" example);
#  - index=False writes just the real columns, which is the version to use.
df_numeric.to_csv("credits-clean-numeric(indexed).csv", index=True)  # bad
df_numeric.to_csv("credits-clean-numeric.csv", index=False)

## Converting to nominal
Grouping numeric values into a text representation.

In [None]:
# Independent copy, so the nominal conversion leaves the original `df` untouched.
df_nominal = df.copy()

Casting Case_no to string. These values are only row identifiers, so there is no more meaningful text label to give them; they are simply converted as they are.

In [None]:
# Show the dtype of the column before any change is made.
print("Type pre:", df_nominal['Case_no'].dtype)

# Cast the identifiers to strings. The values are read from `df`, which holds the
# same data as its copy `df_nominal`, and stored in the nominal DataFrame.
df_nominal['Case_no'] = df['Case_no'].astype(str)

# Display the dtype again to prove the change.
print("Type post:", df_nominal['Case_no'].dtype)
# The author expects this to print 'object'.
# NOTE(review): the reported name may depend on the pandas version — confirm.

Deciding the ranges for each label.

In [None]:
# Bin edges and labels for the 'age' attribute.
# n + 1 edges delimit n intervals, so there is exactly one label per interval.
# Ages outside 0..80 are not covered by any interval.
bins_age = [0, 18, 30, 60, 80]
labels_age = ["child", "young-adult", "adult", "elderly"]

# Bin edges and labels for the 'credit_amount' attribute (same edge/label layout).
# Amounts outside 0..100000 are not covered by any interval.
bins_credit_amount = [0, 500, 1_000, 2_000, 5_000, 10_000, 100_000]
labels_credit_amount = [
    "very-low",
    "low",
    "moderate",
    "high",
    "really-high",
    "exceptional",
]

Applying the value bins.

In [None]:
# pd.cut() is pandas' ready-made binning tool: it replaces each number with the
# label of the interval it falls into.
binning_plan = (
    ('age', bins_age, labels_age),
    ('credit_amount', bins_credit_amount, labels_credit_amount),
)
for column, edges, names in binning_plan:
    df_nominal[column] = pd.cut(df_nominal[column], bins=edges, labels=names)

# Show work done: how many rows ended up under each label.
print(df_nominal['credit_amount'].value_counts())
print(df_nominal['age'].value_counts())

Save the file

In [None]:
# Save the nominal dataset; index=False keeps the row index out of the file.
df_nominal.to_csv("credits-clean-nominal.csv", index=False)

***

# Data visualisation
Using pyplot.

## Distribution of Credit Class by Personal Status
- Count the occurrences of 'good' and 'bad' for each unique entry in the 'personal_status' field.
- Group them together for easy displaying.
- Plot as a double bar chart.

### Counts
Using the .value_counts() we can calculate the occurrences of each credit class for each unique personal status.

In [None]:
# Boolean masks selecting the rows of each credit class.
is_good = df['class'] == 'good'
is_bad = df['class'] == 'bad'

# Occurrences of every 'personal_status' value within the 'good' class ...
counts_good_personal_status = df[is_good]['personal_status'].value_counts()

# ... and within the 'bad' class.
counts_bad_personal_status = df[is_bad]['personal_status'].value_counts()

### Data Preparation
Plotting is done using a double-bar chart. This way we can more easily and effectively display what is most important. This also acts like a point of comparison between the two credit classes.
A width parameter is necessary to avoid bars overlapping with each other on the same x-values.

#### Decoration
First the actual figure is created along with an axes. Some decoration is created; title, and labels.

In [None]:
# Create the figure and its single axes; the bars are drawn onto `g1` in a later cell.
fig_personal_status, g1 = plt.subplots()

# Title and axis labels for easier reading of the chart.
g1.set_title("Distribution of Credit Class by Personal Status")
g1.set_xlabel("Personal Status")
g1.set_ylabel("Counts");  # trailing ';' so the notebook does not echo the call's return value

# NOTE: the still-empty figure is displayed when this cell runs; the author found
# no way to suppress that (observed in PyCharm).

#### X-Axis Labels
The X-axis labels have to be evenly arranged, for this the .set_xticks() function is used to tell pyplot where to set the x-axis labels.
The .unique() pandas function is used to get all the unique personal_status entries. These are then used as the x-axis labels at which the bars are placed.

In [None]:
# One tick per distinct 'personal_status' value, at the integer positions 0..n-1.
# `x` is kept because the plotting cell offsets the two bar series around these positions.
statuses = df['personal_status'].unique()
x = np.arange(len(statuses))
g1.set_xticks(x)
# Label the ticks with the status names, in the order returned by unique().
g1.set_xticklabels(statuses)

### Plotting

In [None]:
# value_counts() returns the categories ordered by frequency, while the tick labels
# come from df['personal_status'].unique() (order of first appearance). Re-align both
# count series to the tick-label order so that every bar stands above its own label.
# A category that never occurs in one credit class gets height 0 instead of making
# the series shorter than the list of bar positions.
status_order = df['personal_status'].unique()
bar_positions = np.arange(len(status_order))
good_heights = counts_good_personal_status.reindex(status_order, fill_value=0)
bad_heights = counts_bad_personal_status.reindex(status_order, fill_value=0)

# Plot the two classes side by side: each series is shifted half a bar width away
# from the shared tick position, so the bars sit next to each other without overlapping.
g1.bar(bar_positions - bar_width/2, good_heights, bar_width, label='Good Credit Class', color='#06d6a0')
g1.bar(bar_positions + bar_width/2, bad_heights, bar_width, label='Bad Credit Class', color='#ef476f')
g1.legend()

# Display final figure (graph).
fig_personal_status

## Distribution of Credit Class by Saving Status
- Count the occurrences of 'good' and 'bad' for each unique entry in the 'saving_status' field.
- Group them together for easy displaying.
- Plot as a double bar chart.
- *(this is the same as the task above just with a different field)*.

### Counting
Counting is done the same way as the chart above.

In [None]:
# Boolean masks selecting the rows of each credit class.
is_good = df['class'] == 'good'
is_bad = df['class'] == 'bad'

# Occurrences of every 'saving_status' value within each credit class.
counts_good_saving_status = df[is_good]['saving_status'].value_counts()
counts_bad_saving_status = df[is_bad]['saving_status'].value_counts()

### Data Preparation

#### Decoration
First the actual figure is created along with an axes. Some decoration is created; title, and labels.

In [None]:
# Create a NEW figure and axes for the saving-status chart, so the earlier
# figure is not drawn over; the bars are added to `g2` in a later cell.
fig_saving_status, g2 = plt.subplots()
# Title and axis labels.
g2.set_title("Distribution of Credit Class by Saving Status")
g2.set_xlabel("Saving Status")
g2.set_ylabel("Counts")

#### X-Axis Labels
The X-axis labels have to be evenly arranged, for this the .set_xticks() function is used to tell pyplot where to set the x-axis labels.
The .unique() pandas function is used to get all the unique saving_status entries. These are then used as x-axis labels from where the bar charts will have their roots.

In [None]:
# One tick per distinct 'saving_status' value, at the integer positions 0..n-1.
# Note that this rebinds `statuses` and `x`, which the personal-status cells also use.
statuses = df['saving_status'].unique()
x = np.arange(len(statuses))
g2.set_xticks(x)
# Label the ticks with the status names, in the order returned by unique().
g2.set_xticklabels(statuses)

### Plotting

In [None]:
 # Plotting here.
# value_counts() returns the categories ordered by frequency, while the tick labels
# come from df['saving_status'].unique() (order of first appearance). Re-align both
# count series to the tick-label order so that every bar stands above its own label.
# A category that never occurs in one credit class gets height 0 instead of making
# the series shorter than the list of bar positions.
saving_order = df['saving_status'].unique()
saving_positions = np.arange(len(saving_order))
good_saving_heights = counts_good_saving_status.reindex(saving_order, fill_value=0)
bad_saving_heights = counts_bad_saving_status.reindex(saving_order, fill_value=0)

# Two bar series, each shifted half a bar width away from the shared tick position.
g2.bar(saving_positions - bar_width/2, good_saving_heights, bar_width, label='Good Credit Class', color='#06d6a0')
g2.bar(saving_positions + bar_width/2, bad_saving_heights, bar_width, label='Bad Credit Class', color='#ef476f')
g2.legend()

# Display the finished figure.
fig_saving_status

***

# Statistical Analysis

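1. Compute the expected good/bad ratio from the entire cleaned dataset.
2. Scale that ratio to the number of rows in each `personal_status` and `saving_status` category to obtain the expected counts for that category.
3. Compute the chi-squared score of every category of both attributes against its expected counts.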


In [None]:
# Dataset-wide distribution of the credit class (not split by any attribute),
# expressed as counts and as ratios that sum to 1.
global_ps = df['class'].value_counts()
global_ratios = global_ps / global_ps.sum()

# Observed and expected good/bad counts, keyed by category, for 'personal_status' ...
local_personal_status = {}
expected_personal_status = {}

# ... and for 'saving_status'.
local_saving_status = {}
expected_saving_status = {}

# value_counts() orders its result by frequency, so the observed counts of a single
# category can come out in a different class order than the global ratios. They may
# also lack a class entirely when it never occurs in that category. Because the
# chi-squared test compares observed and expected values position by position, every
# observed series is re-aligned to the global class order, with 0 for a missing class.
for value in df['personal_status'].unique():
    # Observed counts: rows of this category, selected with a boolean mask.
    local_personal_status[value] = (
        df[df['personal_status'] == value]['class']
        .value_counts()
        .reindex(global_ps.index, fill_value=0)
    )
    # Expected counts: the global ratios scaled to the number of rows in this category.
    expected_personal_status[value] = global_ratios * local_personal_status[value].sum()

for value in df['saving_status'].unique():
    local_saving_status[value] = (
        df[df['saving_status'] == value]['class']
        .value_counts()
        .reindex(global_ps.index, fill_value=0)
    )
    expected_saving_status[value] = global_ratios * local_saving_status[value].sum()

## Calculate the Chi Squared Pvalues.
Chi Squared results are calculated using the built-in scipy function.

In [None]:
# Chi-squared test per category: the observed good/bad counts are compared with
# the counts expected for that category. Each entry stores the result returned
# by scipy.stats.chisquare.
full_chisquare_personal_status = {}
for key, observed in local_personal_status.items():
    full_chisquare_personal_status[key] = stats.chisquare(
        f_obs=observed,
        f_exp=expected_personal_status[key],
    )

full_chisquare_saving_status = {}
for key, observed in local_saving_status.items():
    full_chisquare_saving_status[key] = stats.chisquare(
        f_obs=observed,
        f_exp=expected_saving_status[key],
    )

### Print all the results
Show all the pvalues.

In [None]:
# Print the chi-squared score and p-value of every category, one section per attribute.
report_sections = (
    ("Personal Status:", full_chisquare_personal_status),
    ("Saving Status:", full_chisquare_saving_status),
)
for heading, results in report_sections:
    print(heading)
    for key, result in results.items():
        # Position 0 of the result is the test statistic, position 1 the p-value.
        chi_score, p_value = result[0], result[1]
        print(f"{key}"
              f"\nChiScore: {chi_score}"
              f"\nPValue: {p_value}")
        print("")

### Print all problematic results.
Lists, for easy viewing, all categories whose p-value is at or below the 0.05 threshold, i.e. whose good/bad split deviates significantly from the dataset-wide one.

In [None]:
# Collect every category whose good/bad split deviates greatly from the global
# distribution of the credit class, i.e. whose p-value is at most 0.05.
# Personal-status categories come first, then saving-status categories.
problematic_list = []
for results in (full_chisquare_personal_status, full_chisquare_saving_status):
    for key, result in results.items():
        p_value = result[1]
        if p_value <= 0.05:
            problematic_list.append([key, p_value])


# Print each flagged category together with its p-value.
for category, flagged_p_value in problematic_list:
    print(category + ":")
    print("Pvalue: " + str(flagged_p_value))
    print("")