In [9]:
import pandas as pd

# Load the source dataset, choosing the pandas reader from the file extension.
datafile = "Finance_Trends.csv"
fileType = datafile.rsplit('.', 1)[-1].lower()

if fileType == "csv":
    inputTable = pd.read_csv(datafile)
elif fileType in ("xlsx", "xls"):
    # sheet_name=1 is a zero-indexed position, i.e. the SECOND sheet of the workbook.
    # NOTE(review): confirm the data really lives on the second sheet and not the first.
    inputTable = pd.read_excel(datafile, sheet_name=1)
else:
    # Fail loudly: without this branch inputTable would be undefined (or, worse,
    # silently left over from an earlier run of the kernel).
    raise ValueError(
        f"Unsupported file type '{fileType}' for '{datafile}'; expected csv, xlsx or xls"
    )

# Pristine copy of the loaded data, kept so later cells can refer back to the original values.
targetDataFrame = inputTable.copy()

In [10]:
#Randomly remove values and create outliers
#(deliberately corrupts the loaded table so the cleaning cells have something to repair)

import numpy as np

#Fixed seed so a Restart & Run All corrupts exactly the same cells every time
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

#Always start from the pristine copy so re-running this cell does not stack
#a second round of corruption on top of the first one
inputTable = targetDataFrame.copy()

numericColumns = inputTable.select_dtypes(include='number').columns.tolist()

#Randomly remove roughly 20% of the data (each cell is blanked with probability 0.2)
for column in inputTable:
    randomCells = rng.random(len(inputTable)) < 0.2
    inputTable.loc[randomCells, column] = np.nan

#Randomly turn roughly 10% of the numeric data into outliers by multiplying by 100
#(cells already blanked above stay NaN)
for column in inputTable.select_dtypes(include='number'):
    randomCells = rng.random(len(inputTable)) < 0.1
    inputTable.loc[randomCells, column] = inputTable.loc[randomCells, column] * 100

#Snapshot of the corrupted table, kept for comparison with the cleaned result
ruinedDataFrame = inputTable.copy()
ruinedDataFrame.head(5)

Unnamed: 0,gender,age,Investment_Avenues,Mutual_Funds,Equity_Market,Debentures,Government_Bonds,Fixed_Deposits,PPF,Gold,...,Duration,Invest_Monitor,Expect,Avenue,What are your savings objectives?,Reason_Equity,Reason_Mutual,Reason_Bonds,Reason_FD,Source
0,Male,29.0,Yes,,4.0,7.0,5.0,,1.0,6.0,...,Less than 1 year,Weekly,20%-30%,Public Provident Fund,Health Care,Dividend,,Assured Returns,Fixed Returns,Newspapers and Magazines
1,Male,28.0,Yes,2.0,3.0,6.0,5.0,1.0,400.0,7.0,...,,,30%-40%,,Retirement Plan,Capital Appreciation,,Assured Returns,,Television
2,Female,28.0,Yes,,,700.0,5.0,4.0,100.0,6.0,...,3-5 years,Daily,10%-20%,Mutual Fund,Health Care,,Better Returns,,Risk Free,Financial Consultants
3,Male,19.0,,2.0,3.0,7.0,400.0,6.0,,5.0,...,More than 5 years,Monthly,10%-20%,Equity,Retirement Plan,Capital Appreciation,Better Returns,Assured Returns,Risk Free,Newspapers and Magazines
4,Male,32.0,Yes,,4.0,7.0,,,,6.0,...,More than 5 years,Weekly,10%-20%,Public Provident Fund,Retirement Plan,Capital Appreciation,Better Returns,Assured Returns,,


In [11]:
#Transformation Table
#Cleans the corrupted table: lowercases column names, detects numeric and yes/no
#columns, scales IQR outliers back down and maps yes/no answers to 1/0.

#Start from the corrupted snapshot so this cell gives the same result every time
#it runs (on an already-cleaned table loweredColumns would come out empty and the
#1/0 yes/no columns would be re-detected as numeric)
inputTable = ruinedDataFrame.copy()

originalColumns = inputTable.columns.tolist()
#str() guards against non-string labels (e.g. integer headers), which have no .lower()
inputTable.columns = [str(column).lower() for column in originalColumns]
#Record which column names actually changed when lowercased
loweredColumns = [
    lowered
    for original, lowered in zip(originalColumns, inputTable.columns)
    if str(original) != lowered
]

numericColumns = []
yesAndNoColumns = []

yesNoMapping = {"yes": 1, "no": 0, "Yes": 1, "No": 0}

#Find numeric columns by checking top ten values for numeric data
for column in inputTable:
    topTenValues = inputTable[column].dropna().head(10)
    numericCount = pd.to_numeric(topTenValues, errors='coerce').notna().sum()

    #A single numeric value among the first ten non-null entries marks the column numeric
    if numericCount > 0:
        numericColumns.append(column)

    yesNoCount = topTenValues.isin(list(yesNoMapping)).sum()
    if yesNoCount > 0:
        yesAndNoColumns.append(column)

#Get rid of exceptional values
columnsWithOutliers = []
for column in numericColumns:
    #Anything that cannot be parsed as a number becomes NaN
    inputTable[column] = pd.to_numeric(inputTable[column], errors="coerce")

    #Find the lower quartile of the column values
    lowerQuantile = inputTable[column].quantile(0.25)

    #Find the upper quartile of the values in each column
    upperQuantile = inputTable[column].quantile(0.75)

    #Find the interquartile range of the values in each column
    interQuantileRange = upperQuantile - lowerQuantile

    lowerBound = lowerQuantile - 1.5 * interQuantileRange
    upperBound = upperQuantile + 1.5 * interQuantileRange

    #Separate outliers using the bounds
    outliers = (inputTable[column] < lowerBound) | (inputTable[column] > upperBound)

    if outliers.any():
        columnsWithOutliers.append(column)

    #Divide outliers by 100
    #NOTE(review): this undoes a x100 corruption, but it also rescales any genuine
    #value that falls outside the IQR fence (including low-side outliers) - confirm
    #that is acceptable for columns with a narrow interquartile range
    inputTable.loc[outliers, column] = inputTable.loc[outliers, column] / 100

#Values that are not one of the four yes/no spellings become NaN
for column in yesAndNoColumns:
    inputTable[column] = inputTable[column].map(yesNoMapping)

inputTable.head(10)

Unnamed: 0,gender,age,investment_avenues,mutual_funds,equity_market,debentures,government_bonds,fixed_deposits,ppf,gold,...,duration,invest_monitor,expect,avenue,what are your savings objectives?,reason_equity,reason_mutual,reason_bonds,reason_fd,source
0,Male,29.0,1.0,,4.0,7.0,5.0,,1.0,6.0,...,Less than 1 year,Weekly,20%-30%,Public Provident Fund,Health Care,Dividend,,Assured Returns,Fixed Returns,Newspapers and Magazines
1,Male,28.0,1.0,2.0,3.0,6.0,5.0,1.0,4.0,7.0,...,,,30%-40%,,Retirement Plan,Capital Appreciation,,Assured Returns,,Television
2,Female,28.0,1.0,,,7.0,5.0,4.0,1.0,6.0,...,3-5 years,Daily,10%-20%,Mutual Fund,Health Care,,Better Returns,,Risk Free,Financial Consultants
3,Male,19.0,,2.0,3.0,7.0,4.0,6.0,,5.0,...,More than 5 years,Monthly,10%-20%,Equity,Retirement Plan,Capital Appreciation,Better Returns,Assured Returns,Risk Free,Newspapers and Magazines
4,Male,32.0,1.0,,4.0,7.0,,,,6.0,...,More than 5 years,Weekly,10%-20%,Public Provident Fund,Retirement Plan,Capital Appreciation,Better Returns,Assured Returns,,
5,Male,,1.0,,4.0,7.0,5.0,3.0,1.0,6.0,...,Less than 1 year,Weekly,,Mutual Fund,Health Care,Dividend,Fund Diversification,Assured Returns,Fixed Returns,
6,Male,27.0,1.0,3.0,2.0,7.0,4.0,5.0,1.0,6.0,...,3-5 years,Daily,,Mutual Fund,,,Better Returns,,Risk Free,
7,Male,30.0,1.0,3.0,4.0,6.0,5.0,2.0,,7.0,...,Less than 1 year,Weekly,,Public Provident Fund,Retirement Plan,Capital Appreciation,Better Returns,Assured Returns,Risk Free,Financial Consultants
8,Female,,,2.0,4.0,,5.0,3.0,1.0,6.0,...,3-5 years,,10%-20%,Equity,Retirement Plan,Capital Appreciation,Better Returns,Assured Returns,,
9,Female,37.0,,2.0,4.0,7.0,5.0,3.0,1.0,,...,More than 5 years,Monthly,30%-40%,Mutual Fund,Retirement Plan,Capital Appreciation,,Assured Returns,Risk Free,


In [12]:
#Fill in empty values
#Then: Convert numeric columns with no decimals into integers

import re

#Matches "id" only as a whole token ("id", "user_id", "id number"). A plain
#substring test would also hit names such as "dividend" or "paid" and overwrite
#real data with a running counter.
idTokenPattern = re.compile(r"(?<![a-z0-9])id(?![a-z0-9])")

idColumnsReset = []
for column in inputTable:
    #For id columns, reset the range to fix missing values
    if idTokenPattern.search(str(column).lower()):
        inputTable[column] = range(1, len(inputTable) + 1)
        idColumnsReset.append(column)
    #For numeric columns, find the median and fill NA values
    elif column in numericColumns:
        median = pd.to_numeric(inputTable[column], errors="coerce").median()
        inputTable[column] = inputTable[column].fillna(median)
    #For categorical columns, find the mode value and fill NA values
    else:
        mode = inputTable[column].mode(dropna=True)
        #mode is empty when the column holds no non-null values; leave it untouched then
        if not mode.empty:
            inputTable[column] = inputTable[column].fillna(mode.iloc[0])

numericToInteger = []
for column in numericColumns:
    #Make sure all values are numeric
    numericColumn = pd.to_numeric(inputTable[column], errors="coerce")
    #Check if all values are integers
    noDecimals = (numericColumn.dropna() % 1 == 0).all()
    if noDecimals:
        #Nullable Int64 keeps any remaining missing values as <NA> instead of failing
        inputTable[column] = numericColumn.astype("Int64")
        numericToInteger.append(column)
    else:
        inputTable[column] = numericColumn


In [13]:
#Create Transformation Table

#Each transformation has its own column; for every column of the input table we add
#one row whose Yes/No values say whether that column appears in each transformation list

transformationColumns = [
    "Column Name",
    "Lowercased",
    "Reset ID Column",
    "Fill in NA with Median",
    "Fill in NA with Mode",
    "Changed Type to Integer",
    "Swap Yes/No Values to 1/0",
    "Exceptional Values Cleaned",
]

#Collect the rows first and build the DataFrame once; concatenating inside the
#loop copies the whole table on every iteration
transformationRows = []
for column in inputTable:
    inputRow = {
        "Column Name" : column,
        "Lowercased" : "No",
        "Reset ID Column" : "No",
        "Fill in NA with Median" : "No",
        "Fill in NA with Mode" : "No",
        "Changed Type to Integer" : "No",
        "Swap Yes/No Values to 1/0" : "No",
        "Exceptional Values Cleaned" : "No",
    }

    if column in loweredColumns:
        inputRow["Lowercased"] = "Yes"

    #Mirror the fill step: an ID column is reset and never filled, a numeric column
    #is filled with its median, everything else with its mode
    if column in idColumnsReset:
        inputRow["Reset ID Column"] = "Yes"
    elif column in numericColumns:
        inputRow["Fill in NA with Median"] = "Yes"
    else:
        inputRow["Fill in NA with Mode"] = "Yes"

    if column in numericToInteger:
        inputRow["Changed Type to Integer"] = "Yes"

    if column in yesAndNoColumns:
        inputRow["Swap Yes/No Values to 1/0"] = "Yes"

    if column in columnsWithOutliers:
        inputRow["Exceptional Values Cleaned"] = "Yes"

    transformationRows.append(inputRow)

#Passing columns= keeps the header order (and the headers themselves when there are no rows)
transformationTable = pd.DataFrame(transformationRows, columns=transformationColumns)

transformationTable.head(5)

Unnamed: 0,Column Name,Lowercased,Reset ID Column,Fill in NA with Median,Fill in NA with Mode,Changed Type to Integer,Swap Yes/No Values to 1/0,Exceptional Values Cleaned
0,gender,No,No,No,Yes,No,No,No
1,age,No,No,Yes,No,Yes,No,Yes
2,investment_avenues,Yes,No,No,Yes,No,Yes,No
3,mutual_funds,Yes,No,Yes,No,No,No,Yes
4,equity_market,Yes,No,Yes,No,No,No,Yes
