# Mock for OPPE - 1 [Syllabus: Week 1,2,3,4]

# Imports

In [1]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [91]:
from sklearn.datasets import load_breast_cancer, fetch_california_housing, load_diabetes

In [95]:
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV,train_test_split, cross_validate
from sklearn.preprocessing import FunctionTransformer, PolynomialFeatures, StandardScaler,MinMaxScaler
from sklearn.feature_selection import SelectPercentile, mutual_info_regression

In [126]:
from sklearn.dummy import DummyRegressor
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.linear_model import SGDRegressor, LinearRegression, RidgeCV
from sklearn.pipeline import Pipeline

# Questions

## [Q. 1,2,3,4,5]

In [4]:
X, y = load_breast_cancer(return_X_y=True, as_frame=True)

In [8]:
X.shape

(569, 30)

In [10]:
y.value_counts()

1    357
0    212
Name: target, dtype: int64

In [14]:
print(load_breast_cancer().DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [16]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [17]:
X[["mean perimeter", "mean radius"]].corr()

Unnamed: 0,mean perimeter,mean radius
mean perimeter,1.0,0.997855
mean radius,0.997855,1.0


## [Q. 6]

In [24]:
arr = np.array([[7, 16, 31], [np.nan, np.nan, 66], [12, 5, np.nan],[98, np.nan, 92]])

si = SimpleImputer(strategy="mean")
si.fit_transform(arr)

array([[ 7. , 16. , 31. ],
       [39. , 10.5, 66. ],
       [12. ,  5. , 63. ],
       [98. , 10.5, 92. ]])

## [Q. 7]

In [26]:
transformer = FunctionTransformer(np.log10)
arr = np.array([[1, 1], [2, 3],[10,100]])
transformer.transform(arr)

array([[0.        , 0.        ],
       [0.30103   , 0.47712125],
       [1.        , 2.        ]])

## [Q. 8]

In [36]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
columns = ['sepal length', 'sepal width', 'petal length', 'petal width','label']
df = pd.read_csv(url,  names=columns)
df.head(3)

Unnamed: 0,sepal length,sepal width,petal length,petal width,label
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa


In [37]:
df.shape

(150, 5)

In [39]:
df.columns

Index(['sepal length', 'sepal width', 'petal length', 'petal width', 'label'], dtype='object')

In [40]:
X = df[['sepal length', 'sepal width', 'petal length', 'petal width']]

In [47]:
poly = PolynomialFeatures(degree=2)
X_new = poly.fit_transform(X)
X_new.shape

(150, 15)

## [Q. 9,10,11,12,13,14]

In [66]:
X, y = fetch_california_housing(return_X_y=True, as_frame=True)

In [52]:
sp = SelectPercentile(mutual_info_regression, percentile=10)
X_new = sp.fit_transform(X,y)
X_new.shape

(20640, 1)

In [67]:
X = X.to_numpy()
y = y.to_numpy()

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

In [63]:
X_train.shape

(16512, 8)

In [69]:
dummy_regr = DummyRegressor(strategy="mean")
dummy_regr.fit(X, y)
y_pred = dummy_regr.predict(X)
dummy_regr.score(X, y)

0.0

In [71]:
dummy_regr = DummyRegressor(strategy="mean")
dummy_regr.fit(X_train, y_train)
y_pred = dummy_regr.predict(X_test)
# dummy_regr.score(X, y)

In [72]:
r2_score(y_test, y_pred)

-0.00011336838325370913

In [81]:
pipe = Pipeline(steps=[("scalar", MinMaxScaler()),
                 ("sgd", SGDRegressor())])
pipe.fit(X_train, y_train)

Pipeline(steps=[('scalar', MinMaxScaler()), ('sgd', SGDRegressor())])

In [83]:
mean_absolute_error(y_train, pipe.predict(X_train))

0.5602120307861166

In [84]:
mean_absolute_error(y_test, pipe.predict(X_test))

0.5661399051973017

In [89]:
scores = cross_validate(LinearRegression(), X, y, cv=5,
                        return_train_score=True)

In [90]:
scores

{'fit_time': array([1.25497293, 0.00596666, 0.00500607, 0.00400567, 0.0060029 ]),
 'score_time': array([0.00098109, 0.        , 0.00099564, 0.00099468, 0.00099683]),
 'test_score': array([0.54866323, 0.46820691, 0.55078434, 0.53698703, 0.66051406]),
 'train_score': array([0.59610368, 0.63332765, 0.61097988, 0.61062026, 0.58443293])}

In [93]:
X, y = load_diabetes(return_X_y=True, as_frame=True)

In [94]:
X.shape

(442, 10)

## [Q. 16,17]

In [115]:
X, y = load_diabetes(return_X_y=True, as_frame=True)

In [116]:
X = X.to_numpy()
y = y.to_numpy()

In [117]:
X_new = StandardScaler().fit_transform(X)

In [118]:
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=10)

In [119]:
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
y_pred = lin_reg.predict(X_test)

In [120]:
r2_score(y_test, y_pred)

0.5341962544929233

In [124]:
load_diabetes().feature_names

['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']

In [125]:
load_diabetes().feature_names.index("s3")

6

In [121]:
list(lin_reg.coef_)

[-0.18510224640180026,
 -10.732067597123445,
 24.633765116434127,
 15.61665272926938,
 -34.59096797705119,
 19.54775403591227,
 3.8178648235596246,
 10.378115499204807,
 33.499209786057854,
 1.9036748786574789]

In [122]:
list(lin_reg.coef_)[6]

3.8178648235596246

## [Q. 18]

In [143]:
alphas = np.logspace(-4, 0, num=20)

In [144]:
ridge = RidgeCV(alphas=alphas)
cross_validate(ridge, X,y,cv=5, scoring="neg_mean_absolute_error")

{'fit_time': array([0.22999239, 0.00700045, 0.00399733, 0.00400162, 0.00399685]),
 'score_time': array([0.00099897, 0.00099802, 0.00099897, 0.        , 0.        ]),
 'test_score': array([-43.71712551, -45.01395265, -48.00824154, -42.6564468 ,
        -42.97865081])}

In [145]:
s = [-43.71712551, -45.01395265, -48.00824154, -42.6564468 ,-42.97865081]
sum(s)/len(s)

-44.474883462