In [109]:
# IMPORTS

import pandas as pd
import numpy as np
import statistics as stats

In [110]:
# FUNCTIONS

# Describe

def describe_col(df, col):
    print(f'{col.upper()} ({col})')
    print(f'\nGeneral stats')
    print(df[col].describe())
    print(f'\nValue distribution')
    print(df[col].value_counts())
    
def distrib_missing_values(df, cols = list(df.columns)):
    d_missing_values = {'id': [], 'count_missing_values': []}
    for i in df.index:
        d_missing_values['id'].append(i)
        d_missing_values['count_missing_values'].append(df.loc[i, cols].isna().sum())
    df_missing_values = pd.DataFrame.from_dict(d_missing_values)
    print(df_missing_values['count_missing_values'].value_counts())
    return d_missing_values
    
# Filter

def get_null_cols(df, cols = list(df.columns)):
    null_vals_by_col = df[cols].isnull().sum()
    l_null_cols = list(null_vals_by_col[null_vals_by_col > 0].index)
    return l_null_cols

def get_low_variance_cols(df, percentile = 90):
    low_variance = []
    for col in df._get_numeric_data():
        minimum = min(df[col])
        upper_bound = np.percentile(df[col], percentile)
        if upper_bound == minimum:
            low_variance.append(col)
    return low_variance

def get_numeric_columns(df):
    return list(df.select_dtypes('number'))

def get_nonnumeric_columns(df):
    return list(df.select_dtypes('object'))

# Fill

def fill_missing_vals(df, d_filling_meths):
    for meth, l_cols in d_filling_meths.items():
        for col in l_cols:
            print(f'Filling: {col}...')
            df[col].fillna(meth(df[col]), inplace = True)
            
# Drop

def drop_too_many_missing_values(df, d_missing_values, max_missing_values):
    rows_to_drop = [d_missing_values['id'][i] for i in d_missing_values['id'] if d_missing_values['count_missing_values'][i] > max_missing_values]
    print(f'Dropping {len(rows_to_drop)} rows ({len(rows_to_drop) / len(df.index):.1%} of total): {rows_to_drop}.')
    df.drop(df.index[rows_to_drop], inplace = True)
    
def drop_duplicates_from_subset(df, cols_subset = df1.columns):
    before = len(df)
    df = df[cols_subset].drop_duplicates()
    after = len(df)
    nb_drop = before - after
    print(f'Number of duplicate records dropped: {nb_drop} ({nb_drop / len(df): .1%} of total)')
    
# Other

def zerofy(value):
    return 0

def emptyfy(string):
    return ""

In [111]:
# DATA IMPORT
df = pd.read_excel('../data/2 -Entrepreneurial competency in university students.xlsx')
df1 = df.copy()

In [112]:
# DESCRIPTION OF ORIGINAL DATASET
df1.head(5)

Unnamed: 0,EducationSector,Target IndividualProject,Age,Gender,City,Influenced,Perseverance,DesireToTakeInitiative,Competitiveness,SelfReliance,StrongNeedToAchieve,SelfConfidence,GoodPhysicalHealth,MentalDisorder,KeyTraits,ReasonsForLack,Target-ent_competency
0,Engineering Sciences,No,19.0,Male,Yes,No,2.0,2.0,3.0,3.0,2.0,2.0,3.0,Yes,Passion,,1
1,Engineering Sciences,Yes,22.0,Male,No,Yes,3.0,3.0,3.0,4.0,4.0,3.0,4.0,Yes,Vision,Just not interested! (Want to work in the corp...,0
2,Engineering Sciences,No,18.0,Male,Yes,No,3.0,4.0,3.0,3.0,3.0,4.0,4.0,No,Passion,Not willing to start a venture in India and wa...,0
3,Engineering Sciences,Yes,20.0,Male,Yes,Yes,3.0,3.0,3.0,3.0,4.0,3.0,3.0,No,Rrresilience,Not able to take a Financial Risk,0
4,Engineering Sciences,Yes,19.0,Male,Yes,Yes,2.0,3.0,3.0,3.0,4.0,3.0,2.0,Yes,Vision,,1


In [113]:
df1.describe()

Unnamed: 0,Age,Perseverance,DesireToTakeInitiative,Competitiveness,SelfReliance,StrongNeedToAchieve,SelfConfidence,GoodPhysicalHealth,Target-ent_competency
count,205.0,202.0,208.0,210.0,197.0,211.0,212.0,212.0,219.0
mean,19.756098,3.346535,3.591346,3.571429,3.705584,3.914692,3.566038,3.566038,0.415525
std,1.324366,1.001845,1.163587,1.118416,1.066474,1.033866,1.122952,1.105941,0.493941
min,17.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
25%,19.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,0.0
50%,20.0,3.0,4.0,4.0,4.0,4.0,4.0,4.0,0.0
75%,20.0,4.0,5.0,4.0,5.0,5.0,4.0,4.0,1.0
max,26.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,1.0


In [114]:
# DROP DUPLICATES
l_null_numeric_cols = get_null_cols(df1, get_numeric_columns(df1))
cols_subset = l_null_numeric_cols + ['EducationSector', 'Gender']
drop_duplicates_from_subset(df1, cols_subset)

Number of duplicate records dropped: 2 ( 0.9% of total)


In [115]:
# CHECK LOW VARIANCE COLUMNS
percentile_filter = 90
l_low_variance_cols = get_low_variance_cols(df1, percentile_filter)
print(l_low_variance_cols)

[]


In [116]:
# MISSING NUMERIC VALUES
# Check rows with missing numeric values
d_missing_numeric_values = distrib_missing_values(df1, l_null_numeric_cols)

0    181
1     18
2     11
7      7
3      2
Name: count_missing_values, dtype: int64


In [117]:
# Drop rows with too many missing numeric values
max_missing_values = 2
drop_too_many_missing_values(df1, d_missing_numeric_values, max_missing_values)

Dropping 9 rows (4.1% of total): [76, 82, 198, 199, 200, 201, 202, 203, 204].


In [118]:
# Describe each numeric column with missing values
for col in l_null_numeric_cols:
    describe_col(df1, col)
    print('\n--------------------')

AGE (Age)

General stats
count    196.000000
mean      19.780612
std        1.335062
min       17.000000
25%       19.000000
50%       20.000000
75%       20.000000
max       26.000000
Name: Age, dtype: float64

Value distribution
20.0    67
19.0    61
21.0    27
18.0    18
22.0    12
17.0     6
23.0     2
26.0     1
24.0     1
25.0     1
Name: Age, dtype: int64

--------------------
PERSEVERANCE (Perseverance)

General stats
count    201.000000
mean       3.348259
std        1.004047
min        1.000000
25%        3.000000
50%        3.000000
75%        4.000000
max        5.000000
Name: Perseverance, dtype: float64

Value distribution
4.0    70
3.0    65
2.0    36
5.0    24
1.0     6
Name: Perseverance, dtype: int64

--------------------
DESIRETOTAKEINITIATIVE (DesireToTakeInitiative)

General stats
count    207.000000
mean       3.584541
std        1.162251
min        1.000000
25%        3.000000
50%        4.000000
75%        5.000000
max        5.000000
Name: DesireToTakeInitiativ

In [119]:
# Fill missing values of numeric columns with a specific method
d_filling_numeric_meths = {
    zerofy: [],
    np.mean: [],
    np.nanmedian: ['Age', 'Perseverance', 'DesireToTakeInitiative', 'Competitiveness', 'SelfReliance', 'StrongNeedToAchieve', 'SelfConfidence', 'GoodPhysicalHealth'],
    }

fill_missing_vals(df1, d_filling_numeric_meths)

Filling: Age...
Filling: Perseverance...
Filling: DesireToTakeInitiative...
Filling: Competitiveness...
Filling: SelfReliance...
Filling: StrongNeedToAchieve...
Filling: SelfConfidence...
Filling: GoodPhysicalHealth...


In [120]:
# MISSING NON NUMERIC VALUES
# TBC
l_null_nonnumeric_cols = get_null_cols(df1, get_nonnumeric_columns(df1))
# ReasonsForLack can be empty
try:
    l_null_nonnumeric_cols.remove('ReasonsForLack')
except:
    pass

d_missing_nonnumeric_values = distrib_missing_values(df1, l_null_nonnumeric_cols)

0    199
1     11
Name: count_missing_values, dtype: int64


In [121]:
# Describe each non numeric column with missing values
for col in l_null_nonnumeric_cols:
    describe_col(df1, col)
    print('\n--------------------')

MENTALDISORDER (MentalDisorder)

General stats
count     199
unique      2
top        No
freq      142
Name: MentalDisorder, dtype: object

Value distribution
No     142
Yes     57
Name: MentalDisorder, dtype: int64

--------------------


In [122]:
d_filling_nonnumeric_meths = {
    emptyfy: [],
    stats.mode: ['MentalDisorder']
    }

fill_missing_vals(df1, d_filling_nonnumeric_meths)

Filling: MentalDisorder...


In [123]:
# DATATYPES
# Describe
df1.dtypes

EducationSector               object
Target IndividualProject      object
Age                          float64
Gender                        object
City                          object
Influenced                    object
Perseverance                 float64
DesireToTakeInitiative       float64
Competitiveness              float64
SelfReliance                 float64
StrongNeedToAchieve          float64
SelfConfidence               float64
GoodPhysicalHealth           float64
MentalDisorder                object
KeyTraits                     object
ReasonsForLack                object
Target-ent_competency          int64
dtype: object

In [124]:
# Transform all numeric columns into integers (as they are discreet natural values)
for col in get_numeric_columns(df1):
    df1[col] = df1[col].astype('int64')

In [125]:
# Check results
df1.dtypes

EducationSector              object
Target IndividualProject     object
Age                           int64
Gender                       object
City                         object
Influenced                   object
Perseverance                  int64
DesireToTakeInitiative        int64
Competitiveness               int64
SelfReliance                  int64
StrongNeedToAchieve           int64
SelfConfidence                int64
GoodPhysicalHealth            int64
MentalDisorder               object
KeyTraits                    object
ReasonsForLack               object
Target-ent_competency         int64
dtype: object