In [1]:
import pandas as pd
import numpy as np

1. Take one of the supervised learning models you have built recently and apply at least
three dimensionality reduction techniques to it (separately). Be sure to create a short
summary of each technique you use. Indicate how each changed the model
performance. Reference:
https://machinelearningmastery.com/dimensionality-reduction-algorithms-with-python/

In [2]:
abalone_df = pd.read_csv("../Datasets/abalone/abalone.data")
abalone_df.columns = ['Sex','Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight', 'Viscera weight', 'Shell weight', 'Rings']

In [3]:
# Preprocessing

abalone_df = pd.get_dummies(abalone_df,prefix=["Sex"], columns = ["Sex"], drop_first=True)

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

pipe = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('scaler', StandardScaler()),
    ])

X = abalone_df.drop('Rings',axis=1)
y = abalone_df['Rings']

#train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3 ,random_state=25)

X_train = pipe.fit_transform(X_train)

### SVD - Singular Value Decomposition
The scikit-learn library provides the TruncatedSVD class implementation of Singular Value Decomposition that can be used as a dimensionality reduction data transform. The “n_components” argument can be set to configure the number of desired dimensions in the output of the transform.

In [4]:
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error

svd = TruncatedSVD(n_components=6)

X_train_svd = svd.fit_transform(X_train)
X_test_svd = svd.fit_transform(X_test)

knr = KNeighborsRegressor()

parameters = {"n_neighbors": range(2, 40),
              "weights": ["uniform", "distance"]}

gridsearch_knr = GridSearchCV(knr, parameters, cv=10)
gridsearch_knr.fit(X_train_svd, y_train)

print("Best GridSearch params:", gridsearch_knr.best_estimator_)

model_svd = gridsearch_knr.best_estimator_
y_pred_svd = model_svd.predict(X_test_svd)
print("RMSE :",np.sqrt(mean_squared_error(y_test, y_pred_svd)))
mean_absolute_error(y_test, y_pred_svd)

Best GridSearch params: KNeighborsRegressor(n_neighbors=20, weights='distance')
RMSE : 3.044683136322494


2.3909188243628825

### LDA - Linear Discriminant Analysis
The scikit-learn library provides the LinearDiscriminantAnalysis class implementation of Linear Discriminant Analysis that can be used as a dimensionality reduction data transform. The “n_components” argument can be set to configure the number of desired dimensions in the output of the transform.

In [5]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda = LinearDiscriminantAnalysis(n_components=6)

X_train_lda = lda.fit_transform(X_train, y_train)
X_test_lda = lda.fit_transform(X_test, y_test)

knr = KNeighborsRegressor()

parameters = {"n_neighbors": range(2, 40),
              "weights": ["uniform", "distance"]}

gridsearch_knr = GridSearchCV(knr, parameters, cv=10)
gridsearch_knr.fit(X_train_lda, y_train)

print("Best GridSearch params:", gridsearch_knr.best_estimator_)

model_lda = gridsearch_knr.best_estimator_
y_pred_lda = model_lda.predict(X_test_lda)
print("RMSE :",np.sqrt(mean_squared_error(y_test, y_pred_lda)))
mean_absolute_error(y_test, y_pred_lda)

Best GridSearch params: KNeighborsRegressor(n_neighbors=26, weights='distance')
RMSE : 3.070055907197659


2.3248085118585498

### PCA - Principal Component Analysis
The scikit-learn library provides the PCA class implementation of Principal Component Analysis that can be used as a dimensionality reduction data transform. The “n_components” argument can be set to configure the number of desired dimensions in the output of the transform.

In [6]:
from sklearn.decomposition import PCA

pca = PCA(n_components = 6, random_state=25)

X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.fit_transform(X_test)

knr = KNeighborsRegressor()

parameters = {"n_neighbors": range(2, 40),
              "weights": ["uniform", "distance"]}

gridsearch_knr = GridSearchCV(knr, parameters, cv=10)
gridsearch_knr.fit(X_train_pca, y_train)

print("Best GridSearch params:", gridsearch_knr.best_estimator_)

model_pca = gridsearch_knr.best_estimator_
y_pred_pca = model_pca.predict(X_test_pca)
print("RMSE :",np.sqrt(mean_squared_error(y_test, y_pred_pca)))
mean_absolute_error(y_test, y_pred_pca)

Best GridSearch params: KNeighborsRegressor(n_neighbors=20, weights='distance')
RMSE : 2.9643870117890865


2.2754979607870673

### 2. Write a function that will indicate if an inputted IPv4 address is accurate or not. IP addresses are valid if they have 4 values between 0 and 255 (inclusive), punctuated by periods.

    Input 1:
    2.33.245.5

    Output 1:
    True

    Input 2:
    12.345.67.89

    Output 2:
    False

In [7]:
def IPv4_address(address):
    """Checking if the IP address is valid. First cheching if the address has 4 numeric parts seperated by period. 
    Then checking for address validity to check if each numeric part is less than or equal to 255 """
    
    if address.count(".") != 3:
        return False
    elif address == "":
        return False
    else:
        split_address = address.split(".")
    
    true_counter = 0
    
    for k in split_address:
        if k.isnumeric() == False:
            return False
        elif int(k) > 255:
            return False
        else:
            true_counter = true_counter + 1
                   
    if true_counter == 4:
        return True        

In [8]:
IPv4_address('12.256.67.89')

False

In [9]:
IPv4_address('2.66.245.5')

True