# Trying out scikit-learn

In [1]:
import pandas as pd 
import numpy as np

df = pd.read_csv("House-Price-Prediction-Dataset.csv")
df.describe()

Unnamed: 0,Id,Area,Bedrooms,Bathrooms,Floors,YearBuilt,Price
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,1000.5,2786.2095,3.0035,2.5525,1.9935,1961.446,537676.855
std,577.494589,1295.146799,1.424606,1.10899,0.809188,35.926695,276428.845719
min,1.0,501.0,1.0,1.0,1.0,1900.0,50005.0
25%,500.75,1653.0,2.0,2.0,1.0,1930.0,300098.0
50%,1000.5,2833.0,3.0,3.0,2.0,1961.0,539254.0
75%,1500.25,3887.5,4.0,4.0,3.0,1993.0,780086.0
max,2000.0,4999.0,5.0,4.0,3.0,2023.0,999656.0


In [9]:
# Supervised learning with scikit-learn
from sklearn.linear_model import LinearRegression
X = df[['Area', 'Bedrooms']]
y = df['Price']

model = LinearRegression()
model.fit(X, y)

In [11]:
# Cross-validation in scikit-learn

from sklearn.model_selection import cross_val_score, KFold
kf = KFold(n_splits=6, shuffle=True, random_state=42)
reg = LinearRegression()
cv_results = cross_val_score(reg, X, y, cv=kf)
print(cv_results)
print(np.mean(cv_results), np.std(cv_results))
print(np.quantile(cv_results, [0.025, 0.975]))

[-0.0018793  -0.0080055  -0.00319932 -0.0021076  -0.0144256  -0.00399023]
-0.0056012605458171105 0.004436078835313904
[-0.01362309 -0.00190784]


In [14]:
## Split the data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Train Size:", len(X_train), "Test Size:", len(X_test))

Train Size: 1600 Test Size: 400


In [15]:
# Regularization in scikit-learn
## Ridge Regression

from sklearn.linear_model import Ridge
scores = []
for alpha in [0.1, 1.0, 10.0, 100.0, 1000.0]:
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train, y_train)
    y_pred = ridge.predict(X_test)
    scores.append(ridge.score(X_test, y_test))
print(scores)

[-0.0007394194244556562, -0.0007394186151157278, -0.0007394105463311984, -0.0007393322464490648, -0.000738731848417995]


In [16]:
## Lasso Regression

from sklearn.linear_model import Lasso
scores = []
for alpha in [0.01, 1.0, 10.0, 20.0, 50.0]:
    lasso = Lasso(alpha=alpha)
    lasso.fit(X_train, y_train)
    lasso_pred = lasso.predict(X_test)
    scores.append(lasso.score(X_test, y_test))
print(scores)

[-0.0007394185945550635, -0.0007393275346778694, -0.0007385002789348505, -0.0007375822829926992, -0.0007364969971945801]


In [17]:
# Logistic Regression in scikit-learn 

from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
X_train, X_test, y_trin, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
y_pred_probs = logreg.predict_proba(X_test)[:, 1]
print(y_pred_probs[0])

0.000755064673284445


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [37]:
# Plotting the ROC Curve

from sklearn.metrics import roc_curve 
import matplotlib.pyplot as plt 

fpr, tpr, thresholds = roc_curve(X_train, y_train)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.show()

ValueError: multiclass-multioutput format is not supported

In [39]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Ensure y_train contains only 0s and 1s
fpr, tpr, thresholds = roc_curve(y_train, model.predict(X_train)[:, 1])  # Use probabilities for class 1
roc_auc = auc(fpr, tpr)

plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed