In [28]:
import numpy as np
from sklearn import datasets
import pandas as pd
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import confusion_matrix, classification_report

# Reading data from a csv and getting simple insights from it

In [29]:
df = pd.read_csv("housing.csv")
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [31]:
df.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity'],
      dtype='object')

# Checking and Dropping null values from the DataFrame

In [32]:
df.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [33]:
df.dropna(inplace=True)
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [34]:
df.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [35]:
df = pd.read_csv("housing_final.csv")
df

Unnamed: 0.1,Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
1,4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
2,5,-122.25,37.85,52.0,919.0,213.0,413.0,193.0,4.0368,269700.0,NEAR BAY
3,6,-122.25,37.84,52.0,2535.0,489.0,1094.0,514.0,3.6591,299200.0,NEAR BAY
4,10,-122.26,37.85,52.0,2202.0,434.0,910.0,402.0,3.2031,281500.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...,...
12042,20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
12043,20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
12044,20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
12045,20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [37]:
x = df[['housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']]
y = df['median_house_value' ]

In [38]:
x

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,52.0,1274.0,235.0,558.0,219.0,5.6431
1,52.0,1627.0,280.0,565.0,259.0,3.8462
2,52.0,919.0,213.0,413.0,193.0,4.0368
3,52.0,2535.0,489.0,1094.0,514.0,3.6591
4,52.0,2202.0,434.0,910.0,402.0,3.2031
...,...,...,...,...,...,...
12042,25.0,1665.0,374.0,845.0,330.0,1.5603
12043,18.0,697.0,150.0,356.0,114.0,2.5568
12044,17.0,2254.0,485.0,1007.0,433.0,1.7000
12045,18.0,1860.0,409.0,741.0,349.0,1.8672


In [39]:
y

0        341300.0
1        342200.0
2        269700.0
3        299200.0
4        281500.0
           ...   
12042     78100.0
12043     77100.0
12044     92300.0
12045     84700.0
12046     89400.0
Name: median_house_value, Length: 12047, dtype: float64

# Splitting data into train and test subsets

To find out more about the function train_test_split and its arguments, refer the docs here: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [40]:
x_train , x_test , y_train , y_test = train_test_split(x, y, test_size= 0.25 , random_state=42)

# Scaling data using Robust Scalar

Here, we have used Robust Scalar to scale the data just as an example, any other scalar can be used instead of this function.
Method to scale will more or less remain similar

In [41]:
ro_scaler=RobustScaler()
x_train=ro_scaler.fit_transform(x_train)
x_test=ro_scaler.fit_transform(x_test)

# Linear Regression

Here, we create the Linear Regression Model we want to train. Refer the Python docs for different functions (such as fit, score, predict, etc.) and their usage.

Linear Regression using scikit-learn docs: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

In [42]:
reg=LinearRegression()

In [43]:
reg.fit(x_train,y_train)

LinearRegression()

In [44]:
print(reg.score(x_test,y_test))
print(reg.intercept_)
print(reg.coef_)

0.45591401858008485
182894.52916030804
[ 27041.86803983 -66017.22107694  61719.18030541 -54674.95418672
  51940.37725023 110328.6980144 ]


In [45]:
d=pd.DataFrame(reg.coef_,x.columns[:],columns=["Coeficient"])
d

Unnamed: 0,Coeficient
housing_median_age,27041.86804
total_rooms,-66017.221077
total_bedrooms,61719.180305
population,-54674.954187
households,51940.37725
median_income,110328.698014


### For Logistic Regression, for a very similar approach, you can check sklearn - Logistic Regression

Find the docs for it here: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

# Getting an inbuilt dataset in Python for practice

We load an inbuilt dataset available in Python for our Logistic Regression example.
We will use the iris dataset

In [46]:
iris_data = pd.DataFrame(datasets.load_iris().data)
iris_data.columns = datasets.load_iris().feature_names
iris_data.head(5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [47]:
iris_target = datasets.load_iris().target
print(iris_target)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


# Using statsmodels to train models like in R

You can refer the docs for statsmodels glm here: https://www.statsmodels.org/stable/glm.html

In [48]:
import statsmodels.formula.api as smf 
import statsmodels.api as sm

In [49]:
iris_data['class'] = iris_target

In [50]:
iris_data

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),class
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [51]:
iris_data.rename(columns = {'sepal length (cm)':'slength', 'sepal width (cm)':'swidth', 'petal length (cm)':'plength', 'petal width (cm)':'pwidth'}, inplace = True)

In [52]:
iris_data.rename(columns = {'class':'Y'}, inplace = True)

In [53]:
formula = 'Y ~ slength+swidth+plength+pwidth'

In [54]:
model = smf.glm(formula = formula, data=iris_data)

In [55]:
result = model.fit()
print(result.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                      Y   No. Observations:                  150
Model:                            GLM   Df Residuals:                      145
Model Family:                Gaussian   Df Model:                            4
Link Function:               identity   Scale:                        0.048004
Method:                          IRLS   Log-Likelihood:                 17.437
Date:                Tue, 17 Jan 2023   Deviance:                       6.9606
Time:                        18:16:42   Pearson chi2:                     6.96
No. Iterations:                     3   Pseudo R-squ. (CS):              1.000
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.1865      0.205      0.910      0.3

In [56]:
result.deviance

6.960607814504366

In [57]:
result.null_deviance

100.00000000000003