# **Unit 1 - Data Preparation**

## 1.1 - Variable Types

### Import Libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

### Preview Data

In [None]:
df.head() #previews the first 5 rows
df.head(10) #previews the first 10 rows
pd.set_option('display.max_columns', None) #show all columns in the df

### Data Types

In [None]:
df.info() #to ID Python var types

int_var = 10
float_var = 10.5
str_var = 'Hello'
bool_var = True

### Type Conversion

In [None]:
float_var = float(int_var)
int_var = int(float_var)
str_var = str(int_var)
bool_var = bool(int_var)

### Convert Data To Datetime

In [None]:
# Create a new column holding the datetime-parsed values of an existing column.
# pd.to_datetime is vectorized, so call it on the whole Series instead of per element via .apply().
df['new column name'] = pd.to_datetime(df['existing column name'])

### Basic Operations

In [None]:
sum_var = int_var + float_var
product_var = int_var * float_var
diff_var = float_var - int_var
div_var = float_var / int_var

### String Operations

In [None]:
str_concat = str_var + ' World'
str_upper = str_var.upper()
str_split = str_var.split('e')

### List Operations

In [None]:
# list.append() and list.remove() change the list in place and return None,
# so take a snapshot after each step instead of storing the return value.
list_var = [1, 2, 3, 4, 5]
list_var.append(6)  # add 6 to the end of the list
list_append = list_var.copy()  # [1, 2, 3, 4, 5, 6]
list_var.remove(3)  # remove the first element equal to 3
list_remove = list_var.copy()  # [1, 2, 4, 5, 6]
list_slice = list_var[1:4]  # elements at positions 1, 2 and 3 -> [2, 4, 5]

### Dictionary Operations

In [None]:
dict_var = {'key1': 'value1', 'key2': 'value2'}
dict_keys = dict_var.keys()  # live view of the keys (reflects later changes to dict_var)
dict_values = dict_var.values()  # live view of the values
# dict.update() changes the dictionary in place and returns None,
# so copy the updated dictionary instead of storing the return value.
dict_var.update({'key3': 'value3'})
dict_update = dict(dict_var)

### Looping and Conditionals

In [None]:
for i in range(5):
    print(i)

if int_var > 5:
    print('Greater than 5')
else:
    print('Less than or equal to 5')

### Example Visualizations for Qualitative Vars

In [None]:
# below is code to create a bar chart
df['column name'].value_counts().plot(kind='bar')
plt.title('Chart Title')
plt.xlabel('x axis name')
plt.ylabel('y axis name')
plt.show()

### Example Visualizations for Quantitative Var

In [None]:
# below is code to create a boxplot
df['column name'].plot(kind='box')
plt.show()

### Example Visualizations for Quantitative Continuous Vars

In [None]:
# below is code to create a scatterplot
sns.scatterplot(x = df['column name1'], y = df['column name2'])

## 1.2 - Data Sources

### Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# add more as needed

### Read in Datasets of Various File Types

In [None]:
# Each line below is an alternative -- keep the one that matches the source file type.
df = pd.read_csv('data.csv')
df = pd.read_excel('data.xlsx')
df = pd.read_json('data.json')
df = pd.read_html('https://www.website.com/table')[0] # read_html returns a list of DataFrames (one per table found); [0] selects the first
df = pd.read_sql('SELECT * FROM table', connection) # connection must be a database connection created beforehand

### Join Two Datasets

In [None]:
df = df.merge(specify_df, on='column name', how='left') #specify_df should be the df name

### Filter down observations within a Column by searching for specific text strings

In [None]:
# na=False treats missing values as "no match" so the boolean mask never contains NaN
df = df[df['column'].str.contains('string', na=False)] # string is contained
df = df[df['column'].str.startswith('string', na=False)] # string starts with ...
df = df[df['column'].str.endswith('string', na=False)] # string ends with ...

df = df[df['column'].str.contains('string', case=False, na=False)] # will ignore upper and lowercasing of words, default is case=True
# str.startswith / str.endswith have no `case` argument, so lowercase both sides to ignore case
df = df[df['column'].str.lower().str.startswith('string'.lower(), na=False)]
df = df[df['column'].str.lower().str.endswith('string'.lower(), na=False)]

### View counts of each value within a column

In [None]:
df['column'].value_counts()

### View columns within dataframe

In [None]:
df.columns

### Create a sorted bar chart in descending order

In [None]:
# Sort first so the bars are drawn from largest to smallest
df = df.sort_values(by='column', ascending=False)
df[['bar variable','height variable']].plot(kind='bar', x='bar variable', y='height variable')
plt.title('Title')
plt.xlabel('x-axis label')
plt.ylabel('y-axis label')
plt.show()

### Subset a dataframe to only contain values from a particular column

In [None]:
# Build a boolean mask from the column, then keep only the rows where it is True
df_subset = df[df['column'] == 'value']

### Store API Credentials then Set up Request for OpenWeather Map API

In [None]:
api_key = 'API_KEY'
lat = 'LATITUDE'
lon = 'LONGITUDE'
url = f'https://api.openweathermap.org/data/2.5/weather?lat={lat}&lon={lon}&appid={api_key}'

### Run Request and Return Error if it Doesn't Work

In [None]:
import requests  # not part of this section's import cell

# Send a GET request to the API; the timeout stops the cell from hanging forever if the server never answers
response = requests.get(url, timeout=10)

# Check if the request was successful
if response.status_code == 200:
    # Convert the JSON response to a one-row pandas DataFrame (one column per top-level key)
    weather_data = response.json()
    weather_df = pd.DataFrame([weather_data], columns=weather_data.keys())

    # Display the DataFrame
    print(weather_df)
else:
    print("Failed to retrieve data: Status code", response.status_code)

### Create a bar chart showing the max value for specific dates

In [None]:
# Reduce timestamps to calendar dates so that all rows from the same day fall into one group
df['date variable'] = pd.to_datetime(df['date variable']).dt.date
df_max = df.groupby('date variable')['column to show max of'].max()

df_max.plot(kind='bar')
plt.title('Title')
plt.xlabel('x-axis label')
plt.ylabel('y-axis label')
plt.show()

## 1.3 Data Cleaning

### Import Libraries

In [None]:
import pandas as pd
import numpy as np

### Read in Dataset of Various File Types

In [None]:
df = pd.read_csv('data.csv')
df = pd.read_excel('data.xlsx')
df = pd.read_json('data.json')

### Preview First 5 Rows of Dataframe

In [None]:
df.head()

### Strip leading or trailing whitespaces from data within columns

In [None]:
df['column name'] = df['column name'].str.strip() #trims values within a column

In [None]:
df.columns = df.columns.str.strip() #trims headers only

### Retain only Observations that Are Complete within a Specific Column

In [None]:
df_full = df[df['column_name'].notna()]

In [None]:
print(pd.isna(df['column_name'])) #check for NaN in a column

### Examine the Count of Values within a Specific Column

In [None]:
df['column_name'].value_counts()

### Replace Values with Other Values

In [None]:
df = df.replace(['old value','old value 2', 'old value 3'], ['new value', 'new value 2', 'new value 3']) # list method for multiple values

In [None]:
df = df.replace('old value', 'new value') # list method for one value

In [None]:
df = df.replace({'old value': 'new value', 'old value 2': 'new value 2', 'old value 3': 'new value 3'}) # dictionary method

### Create Two New columns by Splitting Text in One Existing Column

In [None]:
# identify the delimiter then choose from the below
# n=1 splits on the first delimiter only, so exactly two columns are produced (n must be passed by keyword)
df[['column_name_1', 'column_name_2']] = df['column_name'].str.split(' ', n=1, expand=True) # space delimited
df[['column_name_1', 'column_name_2']] = df['column_name'].str.split(',', n=1, expand=True) # comma delimited
df[['column_name_1', 'column_name_2']] = df['column_name'].str.split(':', n=1, expand=True) # colon delimited
df[['column_name_1', 'column_name_2']] = df['column_name'].str.split(';', n=1, expand=True) # semicolon delimited
df[['column_name_1', 'column_name_2']] = df['column_name'].str.split('\t', n=1, expand=True) # tab delimited
df[['column_name_1', 'column_name_2']] = df['column_name'].str.split('_', n=1, expand=True) # underscore delimited
df[['column_name_1', 'column_name_2']] = df['column_name'].str.split('-', n=1, expand=True) # dash delimited

### Create New Column based on Where Statement from Existing Column

In [None]:
df['column_name_new'] = np.where(df['column_name'] == 'value', 'value if true', 'value if false')

### Create New Column Based on Multiple Conditions (Nested) from Existing Column

In [None]:
# Conditions are checked in order; the first one that is True for a row decides its value
conditions = [
    (df['column_name'] < 'value'),
    (df['column_name'] < 'value 2'),
    (df['column_name'] > 'value 3'),
] # add more conditions as necessary

values = ['value if true', 'value if true 2', 'value if true 3'] # add more values as necessary

# default is used for rows that match none of the conditions; it must be the same type as the
# entries in values (np.select otherwise falls back to the integer 0, which does not mix with text)
df['column_name_new'] = np.select(conditions, values, default='value if no condition is true') # create new column based on conditions and values

### Drop Column

In [None]:
df = df.drop('column name', axis=1)

### Set and Sort the Index

In [None]:
df = df.set_index('column name')
df = df.sort_index()

### Export dataset as Excel file

In [None]:
from google.colab import files
df.to_excel('cleaned_dataset.xlsx') #export cleaned data to excel
files.download('cleaned_dataset.xlsx') #download cleaned data as Excel file

# **Unit 2 - Data Understanding**

## 2.1 Descriptive Statistics

### Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

### Import Dataset

In [None]:
df = pd.read_csv('file.csv')
df = pd.read_excel('file.xlsx')

### Dataframe Info

In [None]:
df.info()
df.columns

### Convert Variables that should be floats by replacing non-numeric values with NaNs then converting to floats

In [None]:
df['column'] = pd.to_numeric(df['column'], errors='coerce')

### Check data type of specific columns

In [None]:
df.dtypes, df[['Column 1', 'Column 2']].head()

### Create Subset of Only Certain Variables of Interest

In [None]:
df.columns # view all columns in df

df2 = df[['Column 1', 'Column 2']] # add on as necessary

### Visualize Distributions of Multiple Variables (Histogram grid)

In [None]:
df = df.dropna() # drop rows with missing values

df.hist(layout = (4,3), figsize=(12,12), bins = 10) # change layout, figsize and bins to match number of histograms required, desired figure size, and number of bars/bins
plt.show()

### Display Descriptive Stats & Display Max Columns

In [None]:
pd.set_option('display.max_columns', None) # show all columns in dataframe

df.describe() # show all descriptive stats for each column

df.groupby('column').describe() # show descriptive stats for each column grouped by another column

df.groupby('grouping column')[['column1 to be described', 'column2 to be described']].describe() # show descriptive stats for one column grouped by another column

### Display Descriptive Stats of Two Levels of a Group

In [None]:
df.groupby('Grouping Column')[['Column1 to be grouped', 'Column2 to be grouped']].describe().loc[['Group 1', 'Group 2']]

### Sort dataframe values of a specific column

In [None]:
df.sort_values(by='column', ascending = False).head() # descending order
df.sort_values(by='column', ascending = True).head() # ascending order

### Create a Color-Mapped Scatterplot to show the relationship between two continuous and one categorical variable

In [None]:
plt.figure(figsize=(10, 6)) # set figure size accordingly

# colour each point by the value of the second continuous variable
scatter = plt.scatter(df['Continuous var 1'], df['Continuous var 2'], c=df['Continuous var 2'],
                      cmap = 'viridis', s = 100) # create scatterplot

# add a text label to each data point using the categorical variable
for _, row in df.iterrows():
    plt.text(row['Continuous var 1'], row['Continuous var 2'], row['Categorical var'], fontsize = 10, ha='right')

# Add colorbar, labelled with the variable that drives the colour (the c= argument above)
colorbar = plt.colorbar(scatter)
colorbar.set_label('Continuous var 2')

plt.xlabel('Continuous var 1')
plt.ylabel('Continuous var 2')
plt.title('Scatterplot with Colorbar')
plt.grid(True)

plt.show()

### Subset data by searching for specific values within a column

In [None]:
df_subset = df[df['column'].isin(['value 1', 'value 2'])]

## 2.2 Data Visualizations

### Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Generate Descriptive Statistics

In [None]:
# The parentheses are required: without them the method object is shown instead of the statistics
df['Column_Name'].describe()

### Generate descriptive statistics categorized by group membership

In [None]:
df.groupby('Column_Name1')['Column_Name2'].describe()

#To produce descriptive stats table grouped by a variable, without the ID
vars = df.drop(['ID'], axis=1)
vars.groupby('DV').describe().stack(0)

### Create a Histogram to visualize a data distribution

In [None]:
df['Column_Name'].hist()
plt.show()

### Create histogram and specify number of bins and x-value range

In [None]:
# replace x with number of bins, replace y and z with lower and upper-bounds
plt.hist(df['Column_Name'], bins = x, range = [y,z])
plt.show()

### Create a boxplot visualization

In [None]:
import seaborn as sns  # not part of this section's import cell

# single boxplot; drop missing values first because matplotlib cannot compute a box from data containing NaN
plt.boxplot(df['Column_Name'].dropna())
plt.show() # finish this figure so the next plot is not drawn on top of it

# produce a side by side boxplot of a quant var grouped by a qual var
sns.boxplot(x='qual var', y='quant var', data=df)
plt.title('title')
plt.xlabel('xlabel')
plt.ylabel('ylabel')
plt.show()

### Create multiple boxplots categorized by group

In [None]:
df.boxplot(column=['Column_Name1'], by=['Column_Name2'], figsize=(x, y))
# where column_name1 is your variable on the y-axis, column_name 2 is the grouping variable, and x,y are the width and height dimensions of the figure

### Create an Aggregation Object - Counts

In [None]:
category_count = df['Column_Name'].value_counts()

### Create an Aggregation Object - Mean


In [None]:
category = df['Column_Name'].describe() # summary statistics for the column
category_mean = category['mean'] # pull the mean out of the summary
average_var = df.groupby('qual var')['quant var'].mean() # mean of a quant var for each level of a qual var

### Create an Aggregation Object - Standard Deviation

In [None]:
category = df['Column_Name'].describe()
category_std = category['std']

### Create an Aggregation Object - Group By

In [None]:
# variable names cannot contain spaces -- rename object_name to something meaningful
object_name = df.groupby(by="grouping variable")["variable to be grouped"].describe()
object_mean = object_name['mean'] # mean of the grouped variable for each group

### Create Bar Chart Using Mean

In [None]:
# Calculate the average quant var grouped by a qual var
average_var = df.groupby('qual var')['quant var'].mean()

# Create the bar chart
average_var.plot(kind='bar')
plt.title('title')
plt.xlabel('xlabel')
plt.ylabel('ylabel')
plt.show()

### Create Bar Charts Using Data Aggregate Objects

In [None]:
# Build the aggregate objects that the charts below plot (one value per category / group)
category_counts = df['Column_Name'].value_counts() # number of rows in each category
category_average = df.groupby('qual var')['quant var'].mean() # mean of the quant var for each group
category_sd = df.groupby('qual var')['quant var'].std() # standard deviation of the quant var for each group

plt.bar(x = category_counts.index.values, height = category_counts)
plt.show()

plt.bar(x = category_average.index.values, height = category_average)
plt.show()

plt.bar(x = category_sd.index.values, height = category_sd)
plt.show()

### Create a Scatterplot, Color Coded


In [None]:
sns.scatterplot(x='x var', y='y var', hue='color coded var', data=df)
plt.title('title')
plt.xlabel('x label')
plt.ylabel('y label')
plt.show()

### Create a Pie Chart


In [None]:
plt.pie(category_count.values, labels = category_count.index.values, autopct = '%1.1f%%')
plt.show()

### Adding Titles and Axis Labels
These can be added as additional lines of code after you specify the plot type, e.g., plt.pie(x, label)


In [None]:
plt.title("Name of Plot")

plt.ylabel("Name of Y-axis")

plt.xlabel("Name of X-axis")

plt.legend()

## 2.3 Outliers + Missing Data

### Import Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

pd.set_option('display.max_columns', None) # displays max columns in dataframe

### Read in File as Dataframe

In [None]:
df = pd.read_csv('data.csv')
df = pd.read_excel('data.xlsx')

### Check for Missing Data

In [None]:
df.info()

### Percent Missing Values for a Var

In [None]:
#calcuate percent missing
percent_missing = df['var'].isnull().sum() * 100 / len(df['var'])
print(f'{percent_missing:.2f}%')

In [None]:
#calculate percent missing values for var1 for each var2 category
percentage_missing = df.groupby('var2 category')['var1'].apply(lambda x: x.isnull().sum() / len(x) * 100)

### Create a bar chart to visualize missing data

In [None]:
#viz by count
missing_data = df.isnull().sum() # count missing values in each column

plt.figure(figsize=(10, 6)) # set figure size

missing_data[missing_data > 0].plot(kind='bar') # plot missing values

plt.title('title') # set title
plt.ylabel('Number of Missing Values') # set y-axis label
plt.xlabel('Column Name') # set x-axis label
plt.show()

In [None]:
#viz by percent missing
plt.figure(figsize=(10, 6))
percentage_missing.plot(kind='bar')
plt.xlabel('x label')
plt.ylabel('y label')
plt.title('title')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

### Create a Heatmap Visualization to Show Missingness Across Unique IDs

In [None]:
plt.figure(figsize=(10, 6)) # set figure size

sns.heatmap(df.isnull(), cbar=False, cmap='viridis') # create heatmap

plt.title('title') # set title
plt.show()

### View Columns where data are missing

In [None]:
df[['Column 1', 'Column 2']][df['Column 1'].isnull()]

### Replace NaNs with other Text/Values

In [None]:
# fill in the missing values for a var
# assign the result back: calling fillna(inplace=True) on df['Column 1'] may only change a temporary copy
df['Column 1'] = df['Column 1'].fillna('replacement text or value')
# then check for missing data in that column
df['Column 1'].isnull().sum()

In [None]:
# fill in missing values with "NA" for all columns whose name starts with a given prefix

starts_with_cols = [col for col in df.columns if col.startswith('start with text_')]
# fill all matching columns in one step and assign back so the change is applied to df itself
df[starts_with_cols] = df[starts_with_cols].fillna("NA")

df['column name'].isnull().sum() # check that no missing values remain in one of those columns

### Examine Missingness of one variable against another variable

In [None]:
# show var1 and var2 for the rows where var1 is missing
df[df['var1'].isnull()][['var1', 'var2']]

In [None]:
#create a mask that identifies missing values in a column
missing_var_mask = df['Column 1'].isnull()
# turn True/False into readable labels so the legend text always matches the bars it describes
missing_status = missing_var_mask.map({True: 'Missing', False: 'Not Missing'})

#create viz
plt.figure(figsize=(10, 6)) # set figure size

sns.countplot(data = df, x = 'Column 2', hue = missing_status, hue_order = ['Not Missing', 'Missing']) # create countplot
plt.title('title') # set title
plt.xlabel('Column 2') # set x-axis label
plt.ylabel('Count') # set y-axis label
plt.legend(title = 'Column 1') # legend entries come from the hue labels above
plt.xticks(rotation=90) # rotate x-axis labels
plt.show()

### Examine Outliers in Boxplot

In [None]:
df['Column 1'].plot(kind='box')

In [None]:
sns.boxplot(x=df['Column 1'])
plt.title('title')
plt.xlabel('Column 1')
plt.show()

In [None]:
#creates a grid of boxplots for all numeric variables
# Select numerical columns ('number' covers every integer and float width, not only int64/float64)
numerical_cols = df.select_dtypes(include='number').columns

# Size the grid from the number of columns so there is always a subplot slot for each one
n_cols = 5
n_rows = int(np.ceil(len(numerical_cols) / n_cols))

# Create a grid of boxplots (about 3 inches of height per row)
plt.figure(figsize=(20, 3 * max(n_rows, 1)))
for i, col in enumerate(numerical_cols):
    plt.subplot(n_rows, n_cols, i + 1)
    sns.boxplot(x=df[col])
    plt.title(col)
plt.tight_layout()
plt.show()

### Simple Imputation using Mean, Median, Most Frequent (Mode) value

In [None]:
#create an imputer object that will replace missing values using the mean of the column
imputer = SimpleImputer(strategy='mean')

#creates a copy of the df to avoid modifying the original data
dfi = df.copy()

#computes the mean of Column 1 and stores it in the imputer
imputer.fit(dfi[['Column 1']]) # fit imputer to data

#replaces missing values in Column 1 with the computed mean and assigns the result back to the copy dfi
dfi['Column 1'] = imputer.transform(dfi[['Column 1']]) # transform data

In [None]:
#calculates the median of var_lg (missing values are ignored by default)
median_var_lg = df['var_lg'].median()

# Fill the missing values in 'var_lg' with the median, working on a copy so df stays unchanged
dfs = df.copy()
dfs['var_lg'] = dfs['var_lg'].fillna(median_var_lg)

# Display missing values after imputation
print("Missing values after imputation:")
print(dfs['var_lg'].isnull().sum())

In [None]:
# Calculates the mode of a var ([0] takes the first value if several are tied)
mode_var = df['var'].mode()[0]

# Fill the missing values in 'var' with the mode (dfs is the copy of df created in the median cell)
dfs['var'] = dfs['var'].fillna(mode_var)

# Display missing values after imputation -- check the same column that was just filled
print("Missing values after imputation:")
print(dfs['var'].isnull().sum())

### Multiple Imputation

In [None]:
#multivariate imputation to handle missing values
dfm = df.copy() # create copy of dataframe

features_for_imputation = ['Column 1', 'Column 2'] # list of feature names used to help estimate the missing values
imputation_cols = ['Column to Impute'] + features_for_imputation # already a list of names, so no .columns is needed

iter_imputer = IterativeImputer(max_iter = 10, random_state = 0) # create imputer object
dfm[imputation_cols] = iter_imputer.fit_transform(dfm[imputation_cols]) # fit imputer to data and write the filled values back

In [None]:
# Multivariate imputation to handle missing values, leveraging additional features
# Display missing values before imputation
print("Missing values before imputation:")
print(df['var'].isnull().sum())

# Create a copy of df
dfm = df.copy()

# Select columns to use for imputation
features_for_imputation = dfm[['Col1', 'Col2', 'Col3']] # student choose their features here

# Multiple Imputation using additional features
iter_imputer = IterativeImputer(max_iter=10, random_state=0)
dfm[['var'] + list(features_for_imputation.columns)] = iter_imputer.fit_transform(dfm[['var'] + list(features_for_imputation.columns)])

# Display missing values after imputation
print("Missing values after imputation:")
print(dfm['var'].isnull().sum())

### View rows that are outliers

In [None]:
outliers = df[df['outlier column'] > 'outlier cutoff value'].index.tolist()
outliers_df = df.loc[outliers, :]
outliers_df

### Transform Outliers

In [None]:
# Both transforms pull in large values; np.log needs values > 0 and np.sqrt needs values >= 0
df['outlier column_lg'] = np.log(df['outlier column'])
df['outlier column_sqrt'] = np.sqrt(df['outlier column'])

### Binning/Discretizing a Continuous Variable Column

In [None]:
# discretizes a var in dfs by creating the categories

# Create a function to categorize the var
def categorize_var(var):
    """Return the category label for a single value of the variable.

    Values equal to 0, 1, 2 or 3 are labelled with that number. Any other
    non-missing value is labelled '3+'. Missing values (None / NaN) are
    returned unchanged so they stay missing instead of being counted in
    the '3+' category.
    """
    # Keep missing values missing; comparing NaN to a number is always False,
    # so without this check NaN would fall through to the '3+' label.
    if pd.isna(var):
        return var
    for level in (0, 1, 2, 3):
        if var == level:
            return level
    # NOTE(review): negative and fractional values also land here -- confirm
    # the variable only holds non-negative whole numbers.
    return '3+'

# Apply the function to the 'var' column
dfs['var_cat'] = dfs['var'].apply(categorize_var)
#check via a bar chart
dfs['var_cat'].value_counts().plot(kind='bar')

### Drop Columns

In [None]:
df.drop(['column to drop'], axis=1, inplace=True)
df.drop(['column to drop 1', 'column to drop 2'], axis = 1, inplace = True)

### Write Dataframe to CSV

In [None]:
df.to_csv('file name.csv', index=False)

# **Unit 3 - Data Modeling**

## 3.1 - Correlation Analysis and T-Tests

### Import Libraries


In [None]:
import pandas as pd
from scipy.stats import ttest_ind, pearsonr, spearmanr, ttest_1samp, ttest_rel
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests

#If imputing missing values add below
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

### Produce Descriptive Stats, Dropping out Unnecessary Columns


In [None]:
df.drop(['Column 1', 'Column 2'], axis = 1).describe()
df_corr = df.drop(columns=['var1', 'var2', 'var3']) #drop without describing and assigning to df_corr

### Produce Histogram Grid


In [None]:
# layout = (rows, columns) must provide at least one slot per numeric variable
df.hist(layout = (2, 2), figsize = (10, 10), bins = 10) # adjust parameters as necessary
plt.show()

df_corr.hist(layout=(6,6), figsize=(16,24), bins=15) #generate histos for vars assigned to df_corr
plt.show()

### Create df to store transformed vars


In [None]:
df_corrT = pd.DataFrame()

### Log or Square Root Transform Variables


In [None]:
df['Column 1_sqrt'] = np.sqrt(df['Column 1'])
df['Column 2_log'] = np.log(df['Column 2'])

#add originally normally distributed var to df_corrT
df_corrT['var'] = df['var']

### Produce heat-mapped correlation matrix


In [None]:
# Pearson's
# numeric_only=True skips text/date columns, which would otherwise raise an error in recent pandas versions
cor_matrix = df.corr(numeric_only=True)
cor_matrix.style.background_gradient(cmap = 'seismic', axis = None, vmin = -1, vmax = 1)

#Spearman's
cor_matrix = df_corrT.corr(method = 'spearman', numeric_only=True) #creates correlation matrix
cor_matrix.style.background_gradient(cmap='seismic', axis=None, vmin=-1, vmax=1) #formatting matrix

### Produce Scatterplot between 2 variables


In [None]:
plt.scatter(df['Column 1'], df['Column 2'])
plt.xlabel('Column 1')
plt.ylabel('Column 2')
plt.title('Scatterplot Title')
plt.show()

### Calculate the Correlation Coefficient, test statistical significance of relationship


In [None]:
#Pearson's
corr, p_value = pearsonr(df['Column 1'], df['Column 2'])

print('Pearson Correlation Coefficient:', corr)
print('p-value:', p_value)

alpha = 0.05
if p_value < alpha:
    print('There is a significant correlation between Column 1 and Column 2.')
else:
    print('There is no significant correlation between Column 1 and Column 2.')

#Spearman's
corr, p_value = spearmanr(df_corrT['Column 1'], df_corrT['Column 2'])

# Print the results
print('Spearman correlation coefficient:', corr)
print('P-value:', p_value)

### Run Spearman Rho Correlation Analysis on Particular Variables


In [None]:
cor_matrix2 = df[['Var 1', 'Var 2', 'Var 3']].corr(method = 'spearman')
cor_matrix2.style.background_gradient(cmap = 'seismic', axis = None, vmin = -1, vmax = 1)

### Aggregate Converging Variables


In [None]:
df['Aggregated_Var'] = df[['Var 1', 'Var 2']].mean(axis = 1)
df.drop(['Var 1', 'Var 2'], axis = 1, inplace = True) # remove non-aggregated versions of vars from dataframe

### Two Sample Independent T-Test


In [None]:
group_1 = df[df['Group'] == 'Group 1']
group_2 = df[df['Group'] == 'Group 2']

t_stat, p_value = ttest_ind(group_1['Var 1'], group_2['Var 1'])

print('t-statistic:', t_stat)
print('p-value:', p_value)

### Paired T-Test


In [None]:
t_stat, p_value = ttest_rel(df['Time 1'], df['Time 2'])

print('t-statistic:', t_stat)
print('p-value:', p_value)

## 3.2 - Association Rules Analysis

### Import Libraries

In [None]:
from mlxtend.frequent_patterns import apriori, association_rules

### Set ID as Index

In [None]:
df_ar = df_ar.set_index('id') # set id as your index
df_ar.head()

### Convert transactional data to dummy variables

In [None]:
data = list(df["Column_Name"].apply(lambda x: x.split(","))) #splits values within `Column_Name` to a list, based on ","

from mlxtend.preprocessing import TransactionEncoder #import transactionencoder function from mlxtend
a = TransactionEncoder() #save function in object called 'a'
a_data = a.fit(data).transform(data) #create new data object that transforms list to dummy variables for each item
df = pd.DataFrame(a_data,columns=a.columns_) #create new dataframe object from the dummy variables, take column names
df

### Replace True/False Values with 1 and 0s using dictionary

In [None]:
df.replace({True: 1, False: 0}, inplace=True)

### If any columns are duplicated, make sure to trim whitespace then drop duplicated columns

In [None]:
df.columns = df.columns.str.strip() #trim leading and trailing whitespace from column names
# columns that now share a name are the same item: merge them (1 if any of the duplicates is 1)
# instead of keeping only the first, which would lose the transactions recorded in the others
df = df.T.groupby(level=0).max().T # note: columns come back sorted by name

### Apply Apriori Function to Generate Frequent Rulesets

In [None]:
freq_is = apriori(df, min_support = 0.2, use_colnames = True) #change the minimum support value
freq_is #show the frequent itemsets table

### Run Association Rules Analysis

In [None]:
df_ar = association_rules(freq_is, metric = "confidence", min_threshold = 0.6) #change minimum_threshold confidence value
df_ar

ar_table = association_rules(freq_is, metric = "confidence", min_threshold = 0.6) #change minimum_threshold confidence value
ar_table

### Calculate support count, support, confidence and lift by formula

In [None]:
sc = df['Column_name'].sum() #calculate support count of itemset containing one item
sc = len(df[(df['Col_1'] == 1) &  (df['Col_3'] == 1)]) #calculate support count of itemset containing 2 items

support = sc/len(df) #divide the support count by the total number of observations in df

In [None]:
sc_x = len(df[(df['Col_1'] == 1) & (df['Col_2'] == 1)]) #support count of antecedent with two items in itemset
# every comparison needs its own parentheses because & is evaluated before ==
sc_xy = len(df[(df['Col_1'] == 1) & (df['Col_2'] == 1) & (df['Col_3'] == 1)]) #support count of antecedent and consequent

confidence = sc_xy/sc_x #divide support count of antecedent and consequent by the support count of the antecedent

In [None]:
# rule: {Col_2, Col_3} --> {Col_1}
sc_y = df['Col_1'].sum() #support count of consequent with itemset with one item
s_y = sc_y/len(df) #support of consequent

sc_x = len(df[(df['Col_2'] == 1) & (df['Col_3'] == 1)]) #support count of antecedent with two items
# rows containing the antecedent AND the consequent (Col_1), otherwise confidence would always be 1
sc_xy = len(df[(df['Col_2'] == 1) & (df['Col_3'] == 1) & (df['Col_1'] == 1)]) #calculate support count of ante/cons

confidence = sc_xy/sc_x #confidence of x --> y
Lift = confidence/s_y #lift of x --> y

## 3.3 - Clustering Analysis

### Import Libraries

In [None]:
!pip install kneed # installs the kneed library

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from kneed import KneeLocator
import sklearn.cluster
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from sklearn.manifold import TSNE

### Select Specific columns and Store as Separate dataframe

In [None]:
# to list attributes you want included
features = df[['col1', 'col2', 'add as necessary']]

# to list attributes you want dropped
features = df.copy()
features = features.drop(['col1', 'col2', 'add as necessary'], axis=1)
features.head()

# to further subset
features = features[['col1', 'col2', 'add as necessary']]
features.head()

### Dummy Code any Categorical Variables

In [None]:
features = pd.get_dummies(data = features, dtype = 'int')
# or
features = pd.get_dummies(features)

### Drop Missing

In [None]:
features = features.dropna(subset=['col to drop na'])

### Scale All Variables in Features

In [None]:
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)
scaled_features

### Initialize KMeans_kwargs Dictionary

In [None]:
kmeans_kwargs = {
    "init": "random",
    "n_init": 10,
    "max_iter": 300,
    "random_state": 42,
}

### Run KMeans ForLoop for Range of K (x,y)


In [None]:
sse = [] #create empty list for SSE values
for k in range(x, y):
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
    kmeans.fit(scaled_features)
    sse.append(kmeans.inertia_)

### Create Elbow Plot of Clusters to Inertia Values

In [None]:
plt.style.use("fivethirtyeight")
plt.plot(range(x, y), sse)
plt.xticks(range(x, y))
plt.xlabel("Number of Clusters")
plt.ylabel("Inertia")
plt.show()

### Locate Knee in Plot

In [None]:
kl = KneeLocator(range(x, y), sse, curve="convex", direction="decreasing")
kl.elbow

### Run KMeans Parameters Including Number of Clusters

In [None]:
kmeans = KMeans(n_clusters = K,
                init = 'random',
                max_iter = 300,
                n_init = 10,
                random_state = 42) # change parameters as necessary

### Fit KMeans to Data, Examine Inertia Value

In [None]:
kmeans.fit(scaled_features)
kmeans.inertia_

### Visualize Clusters with 2D TSNE plot

In [None]:
model = TSNE(n_components = 2, random_state = 42) # change parameters as necessary

transformed_data = model.fit_transform(scaled_features) # fit and transform the data
cluster_labels = kmeans.labels_ # get the cluster labels

plt.figure(figsize = (10, 8)) # set the figure size
sns.scatterplot(x = transformed_data[:, 0], y = transformed_data[:, 1], hue = cluster_labels) # plot the data with cluster labels

plt.title('Title') # set the title
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.legend(title='Cluster')
plt.show()

### Produce Centroid Table (Scaled)

In [None]:
centroids = kmeans.cluster_centers_
centroids_table = pd.DataFrame(centroids, columns = features.columns, index = range(kmeans.n_clusters))
centroids_table

### Produce Centroid Table (Unscaled)

In [None]:
unscaled = scaler.inverse_transform(centroids)
unscaled_table = pd.DataFrame(unscaled, columns = features.columns, index = range(kmeans.n_clusters))
unscaled_table

### Plot Centroids for Set Features

In [None]:
# The table index is the cluster number, so clusters run along the x-axis and each selected feature is one line / bar series
centroids_table.plot(kind = 'line', y = ['col 1', 'col 2', 'col 3']) # scaled line chart
plt.title('Title')
plt.xlabel('Cluster')
plt.ylabel('Value')
plt.show()

unscaled_table.plot(kind = 'bar', y = ['col 1', 'col 2', 'col 3']) # unscaled bar chart
plt.title('Title')
plt.xlabel('Cluster')
plt.ylabel('Value')
plt.show()