In [36]:
import pandas as pd
import numpy as np

In [37]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVR
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, r2_score
from sklearn import metrics
import missingno as msno
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

In [38]:
# Load the two datasets used below: heart.csv feeds the classification part
# (its `target` column is the label) and HousePricePrediction.csv the regression
# part (its `SalePrice` column is the target).
# NOTE(review): '/content/' is an absolute path (looks like a Colab upload dir) — confirm it exists wherever this runs.
data1 = pd.read_csv('/content/heart.csv')
data2 = pd.read_csv('/content/HousePricePrediction.csv')

In [39]:
# Peek at five randomly chosen rows of the heart data (no seed is passed, so the rows differ per run).
data1.sample(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
215,43,0,0,132,341,1,0,136,1,3.0,1,0,3,0
271,61,1,3,134,234,0,1,145,0,2.6,1,2,2,0
103,42,1,2,120,240,1,1,194,0,0.8,0,0,3,1
238,77,1,0,125,304,0,0,162,1,0.0,2,3,2,0
68,44,1,1,120,220,0,1,170,0,0.0,2,0,2,1


In [40]:
# Count missing values per column of the heart data.
data1.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [41]:
# Summary statistics (count, mean, std, quartiles, min/max) for every heart-data column.
data1.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [42]:
# Hold out 20% of the heart data for testing; every column except `target`
# is a feature. The fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    data1.drop(columns = ['target']),
    data1['target'],
    test_size = 0.2,
    random_state = 109,
)

In [43]:
# Sanity check: row/column counts of the four pieces produced by the split.
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((242, 13), (61, 13), (242,), (61,))

In [44]:
# Fit a StandardScaler on the training features only, so no test-set statistics are used.
# `scaled` holds whatever fit() returns and is the object used to transform the data afterwards.
stdScaler = StandardScaler()
scaled = stdScaler.fit(x_train)

In [45]:
# Apply the scaler fitted on the training set to both splits.
# NOTE(review): x_train / x_test are overwritten, so running this cell a second
# time would transform the already-transformed data again.
x_train = scaled.transform(x_train)
x_test = scaled.transform(x_test)

In [46]:
def accuracy(Y, YPred):
  """Return the fraction of predictions that equal the true labels.

  Args:
    Y: true labels (list, NumPy array or pandas Series).
    YPred: predicted labels, one per entry of Y.

  Returns:
    Share of positions where Y and YPred agree, as a float in [0, 1].

  Raises:
    ValueError: if Y and YPred do not hold the same number of labels.
    ZeroDivisionError: if Y is empty.
  """
  # Compare as flat NumPy arrays: plain lists would otherwise be compared as
  # whole objects (a single bool), and pandas objects would be matched by
  # index label instead of by position.
  truth = np.asarray(Y).ravel()
  guess = np.asarray(YPred).ravel()
  if truth.size != guess.size:
    raise ValueError(f"Y has {truth.size} labels but YPred has {guess.size}")
  return int(np.count_nonzero(truth == guess)) / truth.size

In [47]:
def sigmoid(x):
  """Logistic function 1 / (1 + exp(-x)), evaluated without overflow.

  Args:
    x: scalar or array-like of real numbers.

  Returns:
    NumPy float array with the shape of x (a NumPy scalar for scalar x),
    with values in [0, 1].
  """
  x = np.asarray(x, dtype=float)
  # exp() is only ever taken of non-positive numbers, so it cannot overflow;
  # for negative x the algebraically equal form exp(x) / (1 + exp(x)) is used.
  shrink = np.exp(-np.abs(x))
  out = np.where(x >= 0, 1 / (1 + shrink), shrink / (1 + shrink))
  return out[()] if out.ndim == 0 else out

def logisticGradient(f_wb, X, Y):
  """Gradient terms of the logistic loss for the current predictions.

  Args:
    f_wb: model outputs, one per row of X.
    X: feature matrix with one sample per row.
    Y: true labels, one per row of X.

  Returns:
    (dw, db): X.T @ (f_wb - Y) and the summed residual (f_wb - Y).
    Neither term is divided by the number of samples.
  """
  residual = f_wb - Y
  sample_count = X.shape[0]
  # Multiplying by a 1 x m row of ones sums the residuals; for 1-D inputs the
  # product keeps one dimension, so db is a 1-element vector, not a scalar.
  ones_row = np.ones((1, sample_count))
  return X.T @ residual, ones_row @ residual

In [48]:
def logisticGradientDescent(X, Y, learningRate = 0.01, epochs = 1000, decay = 0.0002):
  """Fit logistic-regression parameters with batch gradient descent.

  Args:
    X: 2-D feature matrix, one sample per row.
    Y: labels, one per row of X.
    learningRate: step size used for the first epoch.
    epochs: number of full passes over the data.
    decay: after every epoch the step size is divided by (1 + decay).
      The default reproduces the previously hard-coded 0.0002.

  Returns:
    (W, b): weight vector with one entry per column of X, and the bias.
    b starts as the integer 0 and, after the first update, takes the
    type/shape of the `db` term returned by logisticGradient.
  """
  _, n = X.shape
  W = np.zeros(n)   # start from all-zero weights
  b = 0   # and a zero bias

  for _ in range(epochs):
    f_wb = sigmoid((X @ W) + b)   # current predictions
    dw, db = logisticGradient(f_wb, X, Y)   # gradient with respect to W and b
    W -= learningRate * dw
    b -= learningRate * db
    learningRate /= (1 + decay)   # shrink the step a little every epoch

  return W, b

In [49]:
# Train the hand-written logistic regression on the training split
# (250 epochs, initial learning rate 0.001) and print the learned weights and bias.
logisticW, logisticb = logisticGradientDescent(x_train, y_train, learningRate = 0.001, epochs = 250)
print(logisticW, logisticb)

[-0.18377719 -0.82981307  0.737889   -0.36468895 -0.22839733 -0.01042742
  0.2047355   0.48417506 -0.47193572 -0.49901392  0.35843209 -0.77064891
 -0.56431894] [0.0585972]


In [50]:
# Score the test set with the hand-trained parameters and turn the outputs
# into hard 0/1 labels (strictly above 0.5 -> 1) in one vectorized step
# instead of rewriting the array element by element in a Python loop.
y_pred = (sigmoid((x_test @ logisticW) + logisticb) > 0.5).astype(int)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.80      0.82        25
           1       0.86      0.89      0.88        36

    accuracy                           0.85        61
   macro avg       0.85      0.84      0.85        61
weighted avg       0.85      0.85      0.85        61



In [51]:
# Reference model: scikit-learn's LogisticRegression with default settings, fitted and
# scored on the same split so its report can be compared with the hand-written version.
model = LogisticRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.83      0.80      0.82        25
           1       0.86      0.89      0.88        36

    accuracy                           0.85        61
   macro avg       0.85      0.84      0.85        61
weighted avg       0.85      0.85      0.85        61



In [52]:
# Peek at five randomly chosen rows of the house-price data (no seed is passed, so the rows differ per run).
data2.sample(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
667,668,20,RL,65.0,8125,Pave,,Reg,Lvl,AllPub,...,0,,,,0,10,2008,WD,Normal,193500
1037,1038,60,RL,,9240,Pave,,Reg,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,287000
639,640,120,RL,53.0,3982,Pave,,Reg,Lvl,AllPub,...,0,,,,0,10,2006,New,Partial,264561
949,950,20,RL,78.0,9360,Pave,,Reg,Lvl,AllPub,...,0,,,,0,3,2010,WD,Normal,197500
1089,1090,120,FV,37.0,3316,Pave,Pave,IR1,Lvl,AllPub,...,0,,,,0,4,2006,WD,Normal,197000


In [53]:
# Summary statistics for the house-price data.
data2.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [54]:
# Column dtypes and non-null counts, to see which columns have missing values.
data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [55]:
# Build the working frame by dropping the listed columns. drop() returns a new
# DataFrame, so `data2` itself is left untouched and no separate copy() is needed.
# ('MasVnrType' appeared twice in the original list; it is listed once here.)
df2 = data2.drop(columns = [
    'LotFrontage', 'OverallCond', 'MasVnrType', 'PoolQC', 'MiscFeature',
    'GarageCond', 'GarageQual', 'GarageCars', 'GarageFinish', 'GarageType',
    'HalfBath', 'BsmtHalfBath', 'BsmtUnfSF', 'BsmtFinSF2', 'BsmtFinType2',
    'BsmtFinSF1', 'BsmtFinType1', 'BsmtExposure', 'BsmtCond',
])

In [56]:
# Pairwise correlations of the numeric columns only. df2 still contains text
# columns at this point; without numeric_only=True pandas 1.5 emits a
# FutureWarning here and pandas 2.x raises instead of skipping them.
df2.corr(numeric_only=True)

  df2.corr()


Unnamed: 0,Id,MSSubClass,LotArea,OverallQual,YearBuilt,YearRemodAdd,MasVnrArea,TotalBsmtSF,1stFlrSF,2ndFlrSF,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
Id,1.0,0.011156,-0.033226,-0.028365,-0.012713,-0.021998,-0.050298,-0.015415,0.010496,0.00559,...,-0.029643,-0.000477,0.002889,-0.046635,0.00133,0.057044,-0.006242,0.021172,0.000712,-0.021917
MSSubClass,0.011156,1.0,-0.139781,0.032628,0.02785,0.040581,0.022936,-0.238518,-0.251758,0.307886,...,-0.012579,-0.0061,-0.012037,-0.043825,-0.02603,0.008283,-0.007683,-0.013585,-0.021407,-0.084284
LotArea,-0.033226,-0.139781,1.0,0.105806,0.014228,0.013788,0.10416,0.260833,0.299475,0.050986,...,0.171698,0.084774,-0.01834,0.020423,0.04316,0.077672,0.038068,0.001205,-0.014261,0.263843
OverallQual,-0.028365,0.032628,0.105806,1.0,0.572323,0.550684,0.411876,0.537808,0.476224,0.295493,...,0.238923,0.308819,-0.113937,0.030371,0.064886,0.065166,-0.031406,0.070815,-0.027347,0.790982
YearBuilt,-0.012713,0.02785,0.014228,0.572323,1.0,0.592855,0.315707,0.391452,0.281986,0.010308,...,0.22488,0.188686,-0.387268,0.031355,-0.050364,0.00495,-0.034383,0.012398,-0.013618,0.522897
YearRemodAdd,-0.021998,0.040581,0.013788,0.550684,0.592855,1.0,0.179618,0.291066,0.240379,0.140024,...,0.205726,0.226298,-0.193919,0.045286,-0.03874,0.005829,-0.010286,0.02149,0.035743,0.507101
MasVnrArea,-0.050298,0.022936,0.10416,0.411876,0.315707,0.179618,1.0,0.363936,0.344501,0.174561,...,0.159718,0.125703,-0.110204,0.018796,0.061466,0.011723,-0.029815,-0.005965,-0.008201,0.477493
TotalBsmtSF,-0.015415,-0.238518,0.260833,0.537808,0.391452,0.291066,0.363936,1.0,0.81953,-0.174512,...,0.232019,0.247264,-0.095478,0.037384,0.084489,0.126053,-0.018479,0.013196,-0.014969,0.613581
1stFlrSF,0.010496,-0.251758,0.299475,0.476224,0.281986,0.240379,0.344501,0.81953,1.0,-0.202646,...,0.235459,0.211671,-0.065292,0.056104,0.088758,0.131525,-0.021096,0.031372,-0.013604,0.605852
2ndFlrSF,0.00559,0.307886,0.050986,0.295493,0.010308,0.140024,0.174561,-0.174512,-0.202646,1.0,...,0.092165,0.208026,0.061989,-0.024358,0.040606,0.081487,0.016197,0.035164,-0.0287,0.319334


In [57]:
# Remove two more columns: 'GarageYrBlt' and the 'Id' column.
df2 = df2.drop(columns = ['GarageYrBlt', 'Id'])

In [58]:
# Split the frame by dtype. Both results are sub-DataFrames (not lists of names);
# the column names are read from them via `.columns` further down.
cat_cols = df2.select_dtypes(include=['object'])
num_cols = df2.select_dtypes(include=['int64', 'float64'])

In [59]:
# In the text columns, replace missing values with the literal string 'None'.
df2[cat_cols.columns] = df2[cat_cols.columns].fillna('None')

In [60]:
# Drop every row that still has a missing value; after the fill of the text
# columns, only the remaining (non-text) columns can contain any.
df2 = df2.dropna()

In [61]:
from sklearn import preprocessing
# Replace each text column by integer codes. The single encoder is refitted
# for every column, so afterwards it only remembers the classes of the last one.
le = preprocessing.LabelEncoder()
for column_name in cat_cols.columns:
    encoded_values = le.fit_transform(df2[column_name])
    df2[column_name] = encoded_values

In [62]:
# Correlation of every column with SalePrice; keep the columns whose absolute
# correlation exceeds 0.4 (SalePrice itself is among them, with r = 1).
corr = df2.corr()['SalePrice']
strong_enough = corr.abs() > 0.4
selected = corr[strong_enough]
selected

OverallQual     0.789997
YearBuilt       0.522896
YearRemodAdd    0.507158
MasVnrArea      0.477493
ExterQual      -0.633635
BsmtQual       -0.592286
TotalBsmtSF     0.612971
HeatingQC      -0.400075
1stFlrSF        0.606849
GrLivArea       0.710080
FullBath        0.562491
KitchenQual    -0.586597
TotRmsAbvGrd    0.536311
Fireplaces      0.468930
GarageArea      0.622492
SalePrice       1.000000
Name: SalePrice, dtype: float64

In [63]:
# Keep only the columns picked by the correlation filter, then discard 'BsmtQual'.
df2 = df2[selected.index.to_list()].drop(columns = ['BsmtQual'])

In [64]:
# Display the reduced frame (the retained columns, including SalePrice).
df2

Unnamed: 0,OverallQual,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,TotalBsmtSF,HeatingQC,1stFlrSF,GrLivArea,FullBath,KitchenQual,TotRmsAbvGrd,Fireplaces,GarageArea,SalePrice
0,7,2003,2003,196.0,2,856,0,856,1710,2,2,8,0,548,208500
1,6,1976,1976,0.0,3,1262,0,1262,1262,2,3,6,1,460,181500
2,7,2001,2002,162.0,2,920,0,920,1786,2,2,6,1,608,223500
3,7,1915,1970,0.0,3,756,2,961,1717,1,2,7,1,642,140000
4,8,2000,2000,350.0,2,1145,0,1145,2198,2,2,9,1,836,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,6,1999,2000,0.0,3,953,0,953,1647,2,3,7,1,460,175000
1456,6,1978,1988,119.0,3,1542,4,2073,2073,2,3,7,2,500,210000
1457,7,1941,2006,0.0,0,1152,0,1188,2340,2,2,9,2,252,266500
1458,5,1950,1996,0.0,3,1078,2,1078,1078,1,2,5,0,240,142125


In [65]:
# Add a constant column of 1s (presumably meant as the intercept term — TODO confirm),
# then separate the target from the features.
# NOTE(review): the features are MinMax-scaled further down, and that scaling
# maps a constant column to 0, so 'ones' cannot act as an intercept after it — verify.
df2['ones'] = 1
y = df2['SalePrice']
X = df2.drop(columns = ['SalePrice'])

In [66]:
# Hold out 20% of the house data for testing; the fixed random_state keeps
# the split reproducible. This reuses (overwrites) the x_/y_ names of the heart-data split.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 0,
)

In [67]:
# Convert the feature frames to plain NumPy arrays (y_train / y_test are left as they are).
x_train, x_test = x_train.to_numpy(), x_test.to_numpy()

In [68]:
from sklearn.preprocessing import MinMaxScaler
# Rescale each feature with a MinMaxScaler fitted on the training set only;
# the test set is transformed with those same training statistics.
# x_train / x_test are overwritten by their scaled versions.
scaler2 = MinMaxScaler()
x_train = scaler2.fit_transform(x_train)
x_test = scaler2.transform(x_test)

In [69]:
def rmse(yPred,y):
  """Root-mean-square error between predictions and targets.

  Args:
    yPred: predicted values (list, NumPy array or pandas Series).
    y: true values, matched to yPred by position.

  Returns:
    sqrt(mean((yPred - y)**2)) as a NumPy float.

  Raises:
    ValueError: if both inputs hold several values but their shapes differ.
  """
  # Work on plain float arrays: lists cannot be subtracted directly, and two
  # pandas objects would be matched by index label (giving NaN or misaligned
  # pairs) rather than by position.
  predicted = np.asarray(yPred, dtype=float)
  actual = np.asarray(y, dtype=float)
  if predicted.shape != actual.shape and predicted.size > 1 and actual.size > 1:
    # Refuse silent broadcasting such as (n,) against (n, 1).
    raise ValueError(f"shape mismatch: yPred {predicted.shape} vs y {actual.shape}")
  return np.sqrt(np.mean((predicted - actual)**2))

In [None]:
# Ordinary least squares on the scaled features.
# The intercept column is appended here, AFTER scaling: the constant 'ones'
# column added before the split is mapped to 0 by the MinMax scaling, which
# leaves no usable intercept and makes x_train.T @ x_train singular (so it
# cannot be inverted). lstsq also copes with that all-zero column, which an
# explicit matrix inverse does not.
train_design = np.column_stack([x_train, np.ones(x_train.shape[0])])
test_design = np.column_stack([x_test, np.ones(x_test.shape[0])])
U, *_ = np.linalg.lstsq(train_design, np.asarray(y_train), rcond=None)   # last entry of U is the intercept
y_pred = test_design @ U
test_rmse = rmse(y_pred,y_test)
print('Test rmse:',test_rmse)
print('r2 score: ', r2_score(y_test, y_pred))

In [None]:
# Reference model for the regression task. SalePrice is a continuous target,
# so the scikit-learn counterpart is LinearRegression; LogisticRegression is a
# classifier and would treat every distinct price as its own class.
model = LinearRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
test_rmse = rmse(y_pred,y_test)
print('Test rmse:',test_rmse)
print('r2 score: ', r2_score(y_test, y_pred))