In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/home-data-for-ml-course/sample_submission.csv
/kaggle/input/home-data-for-ml-course/sample_submission.csv.gz
/kaggle/input/home-data-for-ml-course/train.csv.gz
/kaggle/input/home-data-for-ml-course/data_description.txt
/kaggle/input/home-data-for-ml-course/test.csv.gz
/kaggle/input/home-data-for-ml-course/train.csv
/kaggle/input/home-data-for-ml-course/test.csv


In [2]:
# Location of the training CSV inside the Kaggle input directory
file_path = "/kaggle/input/home-data-for-ml-course/train.csv"

In [3]:
# Read the training CSV into a DataFrame
df = pd.read_csv(file_path)

In [4]:
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
# Select matplotlib's 'ggplot' style sheet for the figures drawn later on
plt.style.use('ggplot')

# Data cleaning

In [5]:
# Number of missing values in each column
df.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

# Univariate analysis

In [6]:
# Data type of each column
df.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice          int64
Length: 81, dtype: object

In [7]:
def unistats(df, sort_by="skew"):
    """Build a table of univariate statistics, one row per column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are summarised.
    sort_by : str, default "skew"
        Name of the output column used as the secondary sort key. The
        primary key is ``numeric``; both are sorted in descending order, so
        numeric columns are listed first.

    Returns
    -------
    pandas.DataFrame
        Indexed by the column names of ``df`` with the columns ``count``,
        ``missing``, ``unique``, ``dtype``, ``min``, ``max``, ``mean``,
        ``median``, ``mode``, ``std``, ``skew``, ``kurt`` and ``numeric``.
        For non-numeric columns the statistics that are not computed hold
        the placeholder string ``'-'``. ``mode`` is NaN for a column that
        has no non-missing values.
    """
    import pandas as pd

    output_df = pd.DataFrame(columns=['count', 'missing', 'unique', 'dtype', 'min', 'max', 'mean', 'median', 'mode', 'std', 'skew', 'kurt', 'numeric'])
    for col in df:
        series = df[col]
        is_numeric = pd.api.types.is_numeric_dtype(series)

        # mode() ignores missing values, so it is empty when the whole column
        # is missing; fall back to NaN instead of indexing an empty array.
        modes = series.mode()
        mode = modes.values[0] if len(modes) else float("nan")

        # Statistics that are defined for every dtype.
        common = [series.count(), series.isnull().sum(), series.nunique(), series.dtype]
        if is_numeric:
            output_df.loc[col] = common + [series.min(), series.max(), series.mean(), series.median(), mode, series.std(), series.skew(), series.kurt(), is_numeric]
        else:
            output_df.loc[col] = common + ['-', '-', '-', '-', mode, '-', '-', '-', is_numeric]
    return output_df.sort_values(by=["numeric", f"{sort_by}"], ascending=False)

In [8]:
# Raise the display limit to 100 rows so the per-column summary is not truncated
pd.set_option('display.max_rows', 100)
# One row of univariate statistics per column, sorted by the default key ("skew")
unistats(df)

Unnamed: 0,count,missing,unique,dtype,min,max,mean,median,mode,std,skew,kurt,numeric
MiscVal,1460,0,21,int64,0,15500,43.489041,0.0,0,496.123024,24.476794,701.003342,True
PoolArea,1460,0,8,int64,0,738,2.758904,0.0,0,40.177307,14.828374,223.268499,True
LotArea,1460,0,1073,int64,1300,215245,10516.828082,9478.5,7200,9981.264932,12.207688,203.243271,True
3SsnPorch,1460,0,20,int64,0,508,3.409589,0.0,0,29.317331,10.304342,123.662379,True
LowQualFinSF,1460,0,24,int64,0,572,5.844521,0.0,0,48.623081,9.011341,83.234817,True
KitchenAbvGr,1460,0,4,int64,0,3,1.046575,1.0,1,0.220338,4.488397,21.532404,True
BsmtFinSF2,1460,0,144,int64,0,1474,46.549315,0.0,0,161.319273,4.255261,20.113338,True
ScreenPorch,1460,0,76,int64,0,480,15.060959,0.0,0,55.757415,4.122214,18.439068,True
BsmtHalfBath,1460,0,3,int64,0,2,0.057534,0.0,0,0.238753,4.103403,16.396642,True
EnclosedPorch,1460,0,120,int64,0,552,21.95411,0.0,0,61.119149,3.089872,10.430766,True


# Bivariate analysis

In [9]:
def anova(df, feature, label):
    """Run a one-way ANOVA of ``label`` across the groups of ``feature``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    feature : str
        Column whose distinct values define the groups.
    label : str
        Column whose values are compared between the groups.

    Returns
    -------
    The result of ``scipy.stats.f_oneway``, which unpacks as
    ``(F statistic, p-value)``.

    Notes
    -----
    Rows whose ``feature`` value is missing are left out of the test.
    ``scipy.stats.f_oneway`` is called with one sample per group, so
    ``feature`` needs at least two distinct non-missing values.
    """
    from scipy import stats

    # groupby drops missing keys, so iterating it skips NaN groups instead of
    # failing on a lookup for them. sort=False keeps the groups in order of
    # first appearance; observed=True avoids empty groups for unused
    # categorical levels.
    samples = [
        group[label]
        for _, group in df.groupby(feature, sort=False, observed=True)
    ]

    return stats.f_oneway(*samples)
    

In [10]:
def bivstats(df, label):
    """Relate every column of ``df`` other than ``label`` to ``label``.

    For each such column one row is written to the result:

    * a column containing missing values gets the marker ``"nulls"`` in
      ``p-value`` and NaN elsewhere;
    * a numeric column gets the Pearson correlation with ``label`` in ``r``;
    * any other column gets the one-way ANOVA F statistic in ``F``, computed
      by the notebook's ``anova`` helper.

    ``r``, ``F`` and the p-values are rounded to three decimals. The ``X2``
    column is never filled in by this function and stays NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the features and the label column.
    label : str
        Name of the column every other column is compared with.

    Returns
    -------
    pandas.DataFrame
        Indexed by feature name with the columns ``r``, ``F``, ``X2`` and
        ``p-value``. Rows are first ordered by descending ``|F|`` and then
        re-ordered by descending ``|r|``; rows without an ``r`` value are
        placed last by the second sort.
    """
    import numpy as np
    import pandas as pd
    from scipy import stats

    results = pd.DataFrame(columns=['r', 'F', 'X2', 'p-value'])

    for name in df:
        if name == label:
            continue  # the label is not compared with itself

        if df[name].isnull().sum() != 0:
            # Columns with missing values are only flagged, not tested
            results.loc[name] = [np.nan, np.nan, np.nan, "nulls"]
        elif pd.api.types.is_numeric_dtype(df[name]):
            corr, p_val = stats.pearsonr(df[label], df[name])
            results.loc[name] = [round(corr, 3), np.nan, np.nan, round(p_val, 3)]
        else:
            f_stat, p_val = anova(df[[name, label]], name, label)
            results.loc[name] = [np.nan, round(f_stat, 3), np.nan, round(p_val, 3)]

    # First pass: strongest F statistics on top
    f_order = results.F.abs().sort_values(ascending=False).index
    by_f = results.loc[f_order]

    # Second pass: strongest correlations on top
    r_order = by_f.r.abs().sort_values(ascending=False).index
    return by_f.reindex(r_order)

In [11]:
# Compare every other column with the SalePrice column
bivstats(df, 'SalePrice')

Unnamed: 0,r,F,X2,p-value
OverallQual,0.791,,,0.0
GrLivArea,0.709,,,0.0
GarageCars,0.64,,,0.0
GarageArea,0.623,,,0.0
TotalBsmtSF,0.614,,,0.0
1stFlrSF,0.606,,,0.0
FullBath,0.561,,,0.0
TotRmsAbvGrd,0.534,,,0.0
YearBuilt,0.523,,,0.0
YearRemodAdd,0.507,,,0.0


# Multivariate analysis

# Train test split

# ML algorithms

# Keras

# PyTorch