In [None]:
pip install ISLP

In [1]:
import os

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


In [2]:
# Record and report the directory the kernel is currently running in
current_directory = os.getcwd()
print(f"Current Directory: {current_directory}")

Current Directory: /Users/saluwaumuhoza/Downloads


In [3]:

# Locate the CSV file relative to the kernel's working directory rather than through a
# user-specific absolute path, so the notebook also runs on other machines.
file_path = os.path.join(os.getcwd(), "Housing.csv")

# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)
   

In [4]:
# Preview the first five rows of the raw data
df.head(n=5)

Unnamed: 0,SERIAL,DENSITY,OWNERSHP,OWNERSHPD,COSTELEC,COSTGAS,COSTWATR,COSTFUEL,HHINCOME,ROOMS,...,NFAMS,NCOUPLES,PERNUM,PERWT,AGE,MARST,BIRTHYR,EDUC,EDUCD,INCTOT
0,1371772,920.0,1,13,9990,9993,360,9993,75000,7,...,1,0,1,14,52,6,1969,7,71,75000
1,1371773,3640.9,2,22,1080,9993,1800,9993,13600,6,...,2,0,1,83,22,6,1999,10,101,5600
2,1371773,3640.9,2,22,1080,9993,1800,9993,13600,6,...,2,0,2,106,22,6,1999,7,71,8000
3,1371774,22.5,1,13,600,9993,9993,9993,7000,5,...,1,0,1,33,62,4,1959,6,63,7000
4,1371775,3710.4,2,22,3600,9993,9997,9993,50500,4,...,1,0,1,297,50,3,1971,7,71,16000


In [5]:
# Explore the dataset: column names, non-null counts and dtypes.
# DataFrame.info() writes its summary directly and returns None, so it is called
# on its own; wrapping it in print() would add a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75388 entries, 0 to 75387
Data columns (total 23 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   SERIAL     75388 non-null  int64  
 1   DENSITY    75388 non-null  float64
 2   OWNERSHP   75388 non-null  int64  
 3   OWNERSHPD  75388 non-null  int64  
 4   COSTELEC   75388 non-null  int64  
 5   COSTGAS    75388 non-null  int64  
 6   COSTWATR   75388 non-null  int64  
 7   COSTFUEL   75388 non-null  int64  
 8   HHINCOME   75388 non-null  int64  
 9   ROOMS      75388 non-null  int64  
 10  BUILTYR2   75388 non-null  int64  
 11  BEDROOMS   75388 non-null  int64  
 12  VEHICLES   75388 non-null  int64  
 13  NFAMS      75388 non-null  int64  
 14  NCOUPLES   75388 non-null  int64  
 15  PERNUM     75388 non-null  int64  
 16  PERWT      75388 non-null  int64  
 17  AGE        75388 non-null  int64  
 18  MARST      75388 non-null  int64  
 19  BIRTHYR    75388 non-null  int64  
 20  EDUC  

SERIAL: Unique identifier for each observation.
DENSITY: Population density of the surrounding area (float64).
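OWNERSHP: Ownership status of the dwelling, stored as an integer code. The rows previewed above contain the values 1 and 2; under the IPUMS coding, 1 = owned or being bought and 2 = rented (0 is reserved for N/A). This is the target variable for the models below.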
OWNERSHPD: Additional information about ownership status.
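COSTELEC: Cost of electricity.
COSTGAS: Cost of gas.
COSTWATR: Cost of water.
COSTFUEL: Cost of other fuels.
Note: values such as 9990, 9993 and 9997 in these cost columns (see the preview above) appear to be special codes rather than actual amounts; check the codebook before treating these columns as continuous.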
HHINCOME: Household income.
ROOMS: Number of rooms in the dwelling.
BUILTYR2: Year of construction of the dwelling.
BEDROOMS: Number of bedrooms in the dwelling.
VEHICLES: Number of vehicles owned by the household.
NFAMS: Number of families in the household.
NCOUPLES: Number of married couples in the household.
PERNUM: Person number within the household.
PERWT: Person weight.
AGE: Age of the individual.
MARST: Marital status.
BIRTHYR: Year of birth.
EDUC: Education level.
EDUCD: Detailed education level.
INCTOT: Total income.

In [6]:
# Identify categorical variables
categorical_columns = ['MARST', 'EDUC']  # Include other categorical columns as needed

# Convert categorical variables into dummy variables
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Identify columns to standardize
continuous_columns = ['DENSITY', 'COSTELEC', 'HHINCOME', 'ROOMS', 'AGE', 'INCTOT']

# Standardize continuous variables
# NOTE(review): the scaler is fit on all rows before the split, so test-set statistics
# influence the scaling; fit on the training rows only if a strict hold-out is required.
scaler = StandardScaler()
df_encoded[continuous_columns] = scaler.fit_transform(df_encoded[continuous_columns])

# Define the target and predictor variables
target = 'OWNERSHP'

# Columns that must not be fed to the model:
#   - the target itself
#   - OWNERSHPD, the detailed version of the target (13 pairs with 1 and 22 with 2 in the
#     preview), which would leak the label
#   - SERIAL, which is only an identifier
excluded_columns = [target, 'OWNERSHPD', 'SERIAL']

# Every remaining column is used exactly once. Prepending the continuous columns to this
# list would select them twice, because they are already part of the difference.
predictors = list(df_encoded.columns.difference(excluded_columns))

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    df_encoded[predictors], df_encoded[target], test_size=0.3, random_state=42
)


In [None]:
# Fit support-vector classifiers with three different kernels.
# SVC is sensitive to feature scale and only a handful of the predictor columns were
# standardized earlier, so each model is wrapped in a pipeline that standardizes every
# feature using statistics learned from the training rows. The fitted objects still
# expose .fit() / .predict(), so they can be used exactly like a bare SVC.

# Linear kernel
svc_linear = make_pipeline(StandardScaler(), SVC(kernel='linear', C=1))
svc_linear.fit(X_train, y_train)

# Radial kernel (RBF)
svc_radial = make_pipeline(StandardScaler(), SVC(kernel='rbf', C=1, gamma='auto'))
svc_radial.fit(X_train, y_train)

# Polynomial kernel
svc_poly = make_pipeline(StandardScaler(), SVC(kernel='poly', C=1, degree=3))
svc_poly.fit(X_train, y_train)


In [None]:
# Function to evaluate a model
def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier on held-out data.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    X_test : array-like
        Feature rows to predict on.
    y_test : array-like
        True labels for ``X_test``.

    Returns
    -------
    tuple
        ``(accuracy, conf_matrix, class_report)`` as produced by
        ``accuracy_score``, ``confusion_matrix`` and ``classification_report``.
    """
    # Predict once and reuse the predictions for all three metrics.
    y_pred = model.predict(X_test)

    # confusion_matrix and classification_report must be imported from
    # sklearn.metrics alongside accuracy_score for this function to run.
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)

    return accuracy, conf_matrix, class_report

# Evaluate each fitted model on the test split and print its metrics
models = {
    'Linear SVM': svc_linear,
    'Radial SVM': svc_radial,
    'Polynomial SVM': svc_poly,
}

for model_name, model in models.items():
    accuracy, conf_matrix, class_report = evaluate_model(model, X_test, y_test)
    # One print call with a newline separator emits the same lines as separate prints.
    print(
        f"Results for {model_name}:",
        f"Accuracy: {accuracy}",
        "Confusion Matrix:",
        conf_matrix,
        "Classification Report:",
        class_report,
        "\n",
        sep="\n",
    )
