In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## STEP 1 : Data Collection

In [2]:
# Location of the raw House_Price.csv dataset.
# NOTE(review): this is an absolute, user-specific Windows path, so the notebook only
# runs as-is on the original author's machine; point it at a local copy before re-running.
filepath_House_Price = r"C:\Users\aakas\PythonStuff\Regression_udemy\resources\Linear_Regression\House_Price.csv"


In [3]:
# Load the raw dataset; the first line of the file supplies the column names.
houses = pd.read_csv(filepath_House_Price, header=0)

In [4]:
# Preview the first rows of the raw data (numeric predictors plus the
# airport / waterbody / bus_ter text columns).
houses.head()

Unnamed: 0,price,crime_rate,resid_area,air_qual,room_num,age,dist1,dist2,dist3,dist4,teachers,poor_prop,airport,n_hos_beds,n_hot_rooms,waterbody,rainfall,bus_ter,parks
0,24.0,0.00632,32.31,0.538,6.575,65.2,4.35,3.81,4.18,4.01,24.7,4.98,YES,5.48,11.192,River,23,YES,0.049347
1,21.6,0.02731,37.07,0.469,6.421,78.9,4.99,4.7,5.12,5.06,22.2,9.14,NO,7.332,12.1728,Lake,42,YES,0.046146
2,34.7,0.02729,37.07,0.469,7.185,61.1,5.03,4.86,5.01,4.97,22.2,4.03,NO,7.394,101.12,,38,YES,0.045764
3,33.4,0.03237,32.18,0.458,6.998,45.8,6.21,5.93,6.16,5.96,21.3,2.94,YES,9.268,11.2672,Lake,45,YES,0.047151
4,36.2,0.06905,32.18,0.458,7.147,54.2,6.16,5.86,6.37,5.86,21.3,5.33,NO,8.824,11.2896,Lake,55,YES,0.039474


In [5]:
# (rows, columns) of the raw dataset.
houses.shape

(506, 19)

## STEP 2 : Importing packages

In [6]:
from sklearn.impute import SimpleImputer
# from sklearn.preprocessing import StandardScaler

# from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.pipeline import Pipeline                
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split

## STEP 3 : Pipeline

In [7]:
# Work on a copy so the raw `houses` frame stays available unchanged.
df = houses.copy()

In [8]:
def num_cat_col_list(DF) :
    """Split the columns of ``DF`` into numerical and categorical (string) columns.

    Parameters
    ----------
    DF : pandas.DataFrame
        Frame whose columns are to be classified.

    Returns
    -------
    tuple[list, list]
        ``(num_list, cat_list)``: names of the numeric-dtype columns and names of
        the string-valued columns, each in the original column order. Columns
        that are neither (e.g. datetimes) appear in neither list.
    """
    from pandas.api.types import is_string_dtype, is_numeric_dtype

    num_list = []
    cat_list = []

    for column in DF:
        values = DF[column]

        if is_numeric_dtype(values):
            num_list.append(column)

        # Check only the non-missing values: on recent pandas versions
        # `is_string_dtype` inspects the elements of object columns, so a text
        # column containing NaN would otherwise be reported as "not string" and
        # silently left out of both lists.
        elif is_string_dtype(values.dropna()):
            cat_list.append(column)

    return (num_list , cat_list)

In [9]:
# Detect which columns are numeric and which hold text categories.
num_list, cat_list = num_cat_col_list(DF=df)

# Report both groups so the split can be checked by eye.
for label, columns in (("Numerical column : ", num_list),
                       ("Categorical column : ", cat_list)):
    print(label, columns)

Numerical column :  ['price', 'crime_rate', 'resid_area', 'air_qual', 'room_num', 'age', 'dist1', 'dist2', 'dist3', 'dist4', 'teachers', 'poor_prop', 'n_hos_beds', 'n_hot_rooms', 'rainfall', 'parks']
Categorical column :  ['airport', 'waterbody', 'bus_ter']


### Building the num_feat and cat_feat lists (target column removed)

In [10]:
# Predictor lists handed to the pipeline: independent copies of the detected
# column lists, with the target 'price' taken out of the numerical one.
cat_feat = list(cat_list)

num_feat = list(num_list)
num_feat.remove('price')

In [11]:
def pipeline_trans(DF,num_feat ,cat_feat, target = 'price', test_size = 0.2, random_state = 100):
    """Split ``DF`` into train/test sets and apply the preprocessing pipeline.

    Numerical features are median-imputed and categorical features are one-hot
    encoded. The transformers are fitted on the training split only and then
    applied to the test split, so no test information leaks into the fit.

    Parameters
    ----------
    DF : pandas.DataFrame
        Full dataset containing the predictors and the ``target`` column.
    num_feat : list
        Names of the numerical predictor columns.
    cat_feat : list
        Names of the categorical predictor columns.
    target : str, default 'price'
        Name of the dependent variable.
    test_size : float, default 0.2
        Fraction of the rows held out for the test set.
    random_state : int, default 100
        Seed for the train/test shuffle.

    Returns
    -------
    tuple[pandas.DataFrame, pandas.DataFrame]
        ``(df_train_trans, df_test_trans)``: the transformed predictors with the
        target appended as the last column, each indexed 0..n-1. Columns of
        ``DF`` listed in neither ``num_feat`` nor ``cat_feat`` are dropped.
    """
    # Separating independent and dependent variables
    X = DF.drop(columns = [target])
    y = DF[target]

    # Train-Test Split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state = random_state, test_size = test_size
    )

    # Pipeline
    num_pipeline = Pipeline([('Imputer' , SimpleImputer(strategy='median'))])

    # handle_unknown='ignore': a category that only occurs in the test split is
    # encoded as all zeros instead of making transform() raise.
    cat_pipeline = Pipeline([('Nominal Encoding' , OneHotEncoder(handle_unknown='ignore'))])

    # sparse_threshold=0 forces a dense result; with the default threshold the
    # output becomes a scipy sparse matrix when one-hot columns dominate, which
    # cannot be wrapped in a DataFrame with named columns below.
    pipeline = ColumnTransformer([("Num pipeline" , num_pipeline , num_feat),
                                  ("Nominal pipeline" , cat_pipeline , cat_feat)
                                 ],
                                 sparse_threshold = 0)

    # Fit on the training data only, then apply the fitted transformers to both splits
    X_train_tr = pipeline.fit_transform(X_train)
    X_test_tr = pipeline.transform(X_test)

    # Output columns after transformation: strip the "<transformer name>__"
    # prefix. Splitting only once keeps source column names that themselves
    # contain "__" intact.
    out_cols = [name.split("__", 1)[-1] for name in pipeline.get_feature_names_out()]

    # Conversion to train and test DataFrames
    X_train_tr_df = pd.DataFrame(X_train_tr, columns = out_cols)
    X_test_tr_df = pd.DataFrame(X_test_tr, columns = out_cols)

    # Re-index the targets from 0 so they align row-by-row with the transformed
    # predictors (which lost the original shuffled index).
    y_train = y_train.reset_index(drop = True)
    y_test = y_test.reset_index(drop = True)

    df_train_trans = pd.concat([X_train_tr_df, y_train], axis = 1)
    df_test_trans = pd.concat([X_test_tr_df, y_test], axis = 1)

    return df_train_trans,df_test_trans

In [12]:
# Run the split + preprocessing; returns the transformed train and test frames.
df_train_tr, df_test_tr = pipeline_trans(
    DF=df,
    num_feat=num_feat,
    cat_feat=cat_feat,
)

In [13]:
# Spot-check the transformed training set: imputed numeric columns, one-hot
# columns, and the target as the last column.
df_train_tr.head(3)

Unnamed: 0,crime_rate,resid_area,air_qual,room_num,age,dist1,dist2,dist3,dist4,teachers,...,rainfall,parks,airport_NO,airport_YES,waterbody_Lake,waterbody_Lake and River,waterbody_None,waterbody_River,bus_ter_YES,price
0,17.8667,48.1,0.671,6.223,100.0,1.48,1.13,1.48,1.46,19.8,...,46.0,0.058067,1.0,0.0,0.0,1.0,0.0,0.0,1.0,10.2
1,0.79041,39.9,0.544,6.122,52.8,2.89,2.34,2.77,2.55,21.6,...,46.0,0.055734,0.0,1.0,0.0,0.0,0.0,1.0,1.0,22.1
2,1.22358,49.58,0.605,6.943,97.4,1.99,1.76,1.92,1.84,25.3,...,57.0,0.054814,0.0,1.0,0.0,0.0,1.0,0.0,1.0,41.3


In [14]:
# Spot-check the transformed test set; its columns should match the training set.
df_test_tr.head(3)

Unnamed: 0,crime_rate,resid_area,air_qual,room_num,age,dist1,dist2,dist3,dist4,teachers,...,rainfall,parks,airport_NO,airport_YES,waterbody_Lake,waterbody_Lake and River,waterbody_None,waterbody_River,bus_ter_YES,price
0,0.03768,31.52,0.404,7.274,38.3,7.53,7.22,7.46,7.03,27.4,...,43.0,0.041174,1.0,0.0,1.0,0.0,0.0,0.0,1.0,34.6
1,0.44178,36.2,0.504,6.552,21.4,3.56,3.28,3.42,3.23,22.6,...,47.0,0.044352,0.0,1.0,0.0,0.0,1.0,0.0,1.0,31.5
2,0.04527,41.93,0.573,6.12,76.7,2.44,2.11,2.46,2.14,19.0,...,20.0,0.059903,0.0,1.0,0.0,1.0,0.0,0.0,1.0,20.6


## STEP 4 : Export the train and test DataFrames as .csv files

In [15]:
# Independent copies of the transformed frames, used for the export below.
D_train, D_test = df_train_tr.copy(), df_test_tr.copy()

In [16]:
# index=False: the frame only carries a default 0..n-1 index; writing it would add
# an unnamed first column that comes back as "Unnamed: 0" when the file is re-read.
D_train.to_csv(r"S1_Part2_Train_DS.csv", index=False)

In [17]:
# index=False: the frame only carries a default 0..n-1 index; writing it would add
# an unnamed first column that comes back as "Unnamed: 0" when the file is re-read.
D_test.to_csv(r"S1_Part2_Test_DS.csv", index=False)