## Data preprocessing

### Step 1: Importing the required libraries

In [24]:
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder


### Step 2: Loading the dataset and handling missing values and outliers

In [25]:
# Load the Boston housing data.
# A hard-coded absolute path only works on the original author's machine, so
# prefer a BostonHousing.csv in the current working directory and fall back to
# the original location when no such file exists.
local_csv = Path("BostonHousing.csv")
author_csv = Path(r"C:\Users\hp\OMDENA\machine-learning-linear-regression-1962vickyrena\BostonHousing.csv")
data_path = local_csv if local_csv.exists() else author_csv

Boston_house = pd.read_csv(data_path)

In [26]:
# Preview the first five rows of the loaded frame.
Boston_house.head(n=5)

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [27]:
### Step 3: Checking for missing values
# Count the null entries in each column.
Boston_house.isna().sum()

crim       0
zn         0
indus      0
chas       0
nox        0
rm         0
age        0
dis        0
rad        0
tax        0
ptratio    0
b          0
lstat      0
medv       0
dtype: int64

### Step 4: Encoding categorical variables

In [28]:
# Print each column's dtype and non-null count to see which columns are
# integer-coded and therefore candidates for categorical encoding.
Boston_house.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   crim     506 non-null    float64
 1   zn       506 non-null    float64
 2   indus    506 non-null    float64
 3   chas     506 non-null    int64  
 4   nox      506 non-null    float64
 5   rm       506 non-null    float64
 6   age      506 non-null    float64
 7   dis      506 non-null    float64
 8   rad      506 non-null    int64  
 9   tax      506 non-null    int64  
 10  ptratio  506 non-null    float64
 11  b        506 non-null    float64
 12  lstat    506 non-null    float64
 13  medv     506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB


In [29]:
### Step 5: One-hot encoding the categorical `rad` column
# Replace `rad` with one integer 0/1 indicator column per distinct value.
# NOTE(review): every level is kept (drop_first is left at its default), so the
# rad_* indicators always sum to 1 -- confirm this is acceptable for the
# linear model fitted later.
encoded_data = pd.get_dummies(data=Boston_house, columns=["rad"], dtype=int)
print(encoded_data)

        crim    zn  indus  chas    nox     rm   age     dis  tax  ptratio  \
0    0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900  296     15.3   
1    0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671  242     17.8   
2    0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671  242     17.8   
3    0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622  222     18.7   
4    0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622  222     18.7   
..       ...   ...    ...   ...    ...    ...   ...     ...  ...      ...   
501  0.06263   0.0  11.93     0  0.573  6.593  69.1  2.4786  273     21.0   
502  0.04527   0.0  11.93     0  0.573  6.120  76.7  2.2875  273     21.0   
503  0.06076   0.0  11.93     0  0.573  6.976  91.0  2.1675  273     21.0   
504  0.10959   0.0  11.93     0  0.573  6.794  89.3  2.3889  273     21.0   
505  0.04741   0.0  11.93     0  0.573  6.030  80.8  2.5050  273     21.0   

     ...  medv  rad_1  rad_2  rad_3  rad_4  rad_5  rad_6  rad_7  rad_8  rad

### Step 6: Normalizing/standardizing numerical data

In [30]:
# Separate the target column (medv) from the predictor columns.
y = encoded_data['medv']
X = encoded_data.drop('medv', axis=1)

# Fit the scaler on the predictors, then apply it to those same predictors.
# NOTE(review): the scaler is fitted on every row, before any train/test split,
# and it also rescales the 0/1 indicator columns (chas, rad_*) -- confirm that
# both are intended.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# Wrap the scaled array in a DataFrame again so the column labels are kept.
X = pd.DataFrame(data=X_scaled, columns=X.columns)

### Step 7: Selecting the best features to use

In [31]:
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

# Number of features that recursive feature elimination should keep.
# (The earlier comment said "top 5" while the code selected 10; the constant
# keeps the comment and the code in agreement.)
N_FEATURES_TO_SELECT = 10

# Linear regression is the estimator RFE uses to rank the features.
model = LinearRegression()

# RFE keeps the N_FEATURES_TO_SELECT (10) highest-ranked features.
# NOTE(review): the selector is fitted on all rows of X and y, so any rows later
# held out for testing also influence which features are chosen -- confirm
# this is acceptable.
selector = RFE(model, n_features_to_select=N_FEATURES_TO_SELECT)
selector.fit(X, y)

# selector.support_ is a boolean mask over X's columns marking the kept features.
selected_features = X.columns[selector.support_]
print("Best Features for Prediction:", list(selected_features))

Best Features for Prediction: ['crim', 'zn', 'nox', 'rm', 'dis', 'tax', 'ptratio', 'b', 'lstat', 'rad_24']


### Step 8: Splitting data into training and testing sets

In [34]:
# Keep only the columns that feature selection retained.
X_selected = X.loc[:, selected_features]

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_selected,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the dimensions of the two feature sets.
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

Train shape: (404, 10), Test shape: (102, 10)


### Step 9: Saving the DataFrames to new CSV files

In [35]:
# Write each split to its own CSV file in the working directory, leaving out
# the index column.
outputs = {
    "X_train.csv": X_train,
    "X_test.csv": X_test,
    "y_train.csv": y_train,
    "y_test.csv": y_test,
}
for filename, split in outputs.items():
    split.to_csv(filename, index=False)
