In [1]:
import numpy as np
import pandas as pd

In [2]:
# Importing data
ic_house_pred_train = pd.read_csv('data/ic_house_pred_train.csv')
ic_house_pred_test = pd.read_csv('data/ic_house_pred_test.csv')

In [3]:
# Separating input and output, and dropping columns

train_db = ic_house_pred_train.drop(['Id'], axis=1)
test_db = ic_house_pred_test.drop(['Id'], axis=1)

In [4]:
ic_house_pred_train.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,136,20,RL,80.0,10400,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,5,2008,WD,Normal,174000
1,1453,180,RM,35.0,3675,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2006,WD,Normal,145000
2,763,60,FV,72.0,8640,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2010,Con,Normal,215200
3,933,20,RL,84.0,11670,Pave,,IR1,Lvl,AllPub,...,0,,,,0,3,2007,WD,Normal,320000
4,436,60,RL,43.0,10667,Pave,,IR2,Lvl,AllPub,...,0,,,,0,4,2009,ConLw,Normal,212000


In [5]:
numeric_columns = train_db.select_dtypes(include=np.number).columns
categoric_columns = [x for x in train_db.columns if x not in numeric_columns]

In [6]:
# Treating missing values

print("Numeric missing values from train database")
print(train_db[numeric_columns].isnull().sum().sum())

print("Numeric missing values from test database")
print(test_db[numeric_columns].isnull().sum().sum())

print("Categoric missing values from train database")
print(train_db[categoric_columns].isnull().sum().sum())

print("Categoric missing values from test database")
print(test_db[categoric_columns].isnull().sum().sum())


def replaceMissingValuesByMean(var_list, data):
    for var in var_list:
        avg = data[var].mean(axis=0)
        data[var].fillna(avg, inplace=True)


# def replaceMissingValuesByMode(var_list, data):
#     for var in var_list:
#         mode = data[var].mode().iloc[0]
#         data[var].fillna(mode, inplace=True)


# Replacing numeric columns
replaceMissingValuesByMean(numeric_columns, train_db)
replaceMissingValuesByMean(numeric_columns, test_db)

# Replacing categoric columns
# replaceMissingValuesByMode(categoric_columns, train_db)
# replaceMissingValuesByMode(categoric_columns, test_db)

Numeric missing values from train database
247
Numeric missing values from test database
101
Categoric missing values from train database
4612
Categoric missing values from test database
2005


In [7]:
# concat train and test

df_all = pd.concat([train_db, test_db], axis=0, sort = False)

In [8]:
# Creating dummies variables

def mostFrequentFeatures(df):
    features = {}

    for col in list(categoric_columns):
        counts = df[col].value_counts(normalize=True)
        columns_until_percent = counts[counts.cumsum() <= 0.7].index
        features[col] = df[col].isin(columns_until_percent)

    return features


def groupDummiesVariables(df, features):
    final_df = None

    for col in list(categoric_columns):
        df[col][~features[col]] = "outros"
        final_df = pd.concat([final_df, pd.get_dummies(df[col], prefix=col)], axis=1, sort=False)
        df = df.drop([col], axis=1)

    df = pd.concat([df, final_df], axis=1, sort=False)

    return df


features = mostFrequentFeatures(df_all)
df_all = groupDummiesVariables(df_all, features)

print("Quantidade de features: " + str(len(df_all.columns)))

Quantidade de features: 112


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col][~features[col]] = "outros"


In [9]:
# Separating test and train bases, and generating respectives csvs

from sklearn.model_selection import train_test_split

train_db, test_db = train_test_split(df_all, test_size=0.3, shuffle=False)

train_db.to_csv("TRAIN_DB.csv", index=None)
test_db.to_csv("TEST_DB.csv", index=None)