In [1]:
import pandas as pd
import numpy as np

# Example DataFrame: 'total_bedrooms' contains two missing values (NaN)
data = dict(
    total_bedrooms=[3, np.nan, 2, 3, np.nan, 1],
    total_rooms=[7, 5, 6, 8, 5, 4],
    population=[800, 600, 1200, 700, 650, 900],
)

housing = pd.DataFrame(data)

# Show the untouched frame so the results of the later cells can be compared with it
print("Initial DataFrame:", housing, sep="\n")




Initial DataFrame:
   total_bedrooms  total_rooms  population
0             3.0            7         800
1             NaN            5         600
2             2.0            6        1200
3             3.0            8         700
4             NaN            5         650
5             1.0            4         900


array([ 3., nan,  2.,  1.])

In [2]:
# Option 1: keep only the rows whose "total_bedrooms" value is present
has_bedrooms = housing["total_bedrooms"].notna()
housing_dropna = housing[has_bedrooms]
print("\nDataFrame after dropping rows with missing 'total_bedrooms':", housing_dropna, sep="\n")




DataFrame after dropping rows with missing 'total_bedrooms':
   total_bedrooms  total_rooms  population
0             3.0            7         800
2             2.0            6        1200
3             3.0            8         700
5             1.0            4         900


In [3]:
# Option 2: discard the "total_bedrooms" column altogether (the original frame is not modified)
housing_drop_column = housing.drop(columns=["total_bedrooms"])
print("\nDataFrame after dropping the 'total_bedrooms' column:", housing_drop_column, sep="\n")




DataFrame after dropping the 'total_bedrooms' column:
   total_rooms  population
0            7         800
1            5         600
2            6        1200
3            8         700
4            5         650
5            4         900


In [4]:
# Option 3: Fill missing values in "total_bedrooms" with the median
median = housing["total_bedrooms"].median()  # Median of the non-missing values
housing_fillna = housing.copy()  # Work on a copy so the original DataFrame is left untouched
# Assign the filled column back instead of calling fillna(..., inplace=True) on
# housing_fillna["total_bedrooms"]: that chained in-place call acts on an
# intermediate Series, emits a FutureWarning in pandas 2.2 and no longer updates
# the DataFrame once Copy-on-Write is enabled.
housing_fillna["total_bedrooms"] = housing_fillna["total_bedrooms"].fillna(median)
print("\nDataFrame after filling missing 'total_bedrooms' with the median:")
print(housing_fillna)


DataFrame after filling missing 'total_bedrooms' with the median:
   total_bedrooms  total_rooms  population
0             3.0            7         800
1             2.5            5         600
2             2.0            6        1200
3             3.0            8         700
4             2.5            5         650
5             1.0            4         900


In [21]:
# /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
# Handling text and categorical attributes with different methods

In [16]:
# //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
# Using the second method

# SimpleImputer, from the scikit-learn library, is used for both the numerical and the categorical column
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

# Small frame mixing a numeric column (Age) and a categorical one (Gender), each with gaps
data = dict(
    Age=[25, np.nan, 35, 45, np.nan],
    Gender=['Male', 'Female', 'Female', np.nan, 'Male'],
)
df = pd.DataFrame(data)

# Replace the missing ages with the median of the observed ages
num_imputer = SimpleImputer(strategy='median')
age_filled = num_imputer.fit_transform(df[['Age']])  # double brackets: the imputer gets a 2-D input
df['Age'] = age_filled

print("After imputing Age with median:", df, sep="\n")


After imputing Age with median:
    Age  Gender
0  25.0    Male
1  35.0  Female
2  35.0  Female
3  45.0     NaN
4  35.0    Male


In [18]:
# Fill the gap in the categorical column (Gender) with its most frequent value (mode).
# 'Male' and 'Female' both occur twice here; the recorded output shows 'Female' being used.
cat_imputer = SimpleImputer(strategy='most_frequent')
gender_filled = cat_imputer.fit_transform(df[['Gender']])  # 2-D result with a single column
df['Gender'] = gender_filled.reshape(-1)  # flatten to 1-D before assigning it as a column

print("After imputing:", df, sep="\n")

After imputing:
    Age  Gender
0  25.0    Male
1  35.0  Female
2  35.0  Female
3  45.0  Female
4  35.0    Male


In [None]:



# ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
# using encoding method





In [22]:
# Importing necessary libraries
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder # this is used for categorical data, ordinal data, (e.g, Excellent, goods, bad)

# Build the sample data; the None entry in 'total_bedrooms' shows up as NaN in the frame
data = pd.DataFrame(
    dict(
        ocean_proximity=[
            'NEAR BAY', 'INLAND', 'NEAR OCEAN', 'NEAR BAY', 'ISLAND',
            'INLAND', 'NEAR OCEAN', 'NEAR BAY', 'NEAR BAY', 'NEAR OCEAN',
        ],
        total_bedrooms=[471, 371, 369, None, 541, 343, 507, 395, 478, 335],
    )
)

# Show the frame before any cleaning or encoding
print("Original Data:", data, sep="\n")

Original Data:
  ocean_proximity  total_bedrooms
0        NEAR BAY           471.0
1          INLAND           371.0
2      NEAR OCEAN           369.0
3        NEAR BAY             NaN
4          ISLAND           541.0
5          INLAND           343.0
6      NEAR OCEAN           507.0
7        NEAR BAY           395.0
8        NEAR BAY           478.0
9      NEAR OCEAN           335.0


In [28]:
# List the distinct categories that occur in the 'ocean_proximity' column
data["ocean_proximity"].unique()


array(['NEAR BAY', 'INLAND', 'NEAR OCEAN', 'ISLAND'], dtype=object)

In [29]:
# Count how many rows fall into each 'ocean_proximity' category
proximity_column = data["ocean_proximity"]
proximity_column.value_counts()

ocean_proximity
NEAR BAY      4
NEAR OCEAN    3
INLAND        2
ISLAND        1
Name: count, dtype: int64

In [23]:
# Handling missing values
median = data["total_bedrooms"].median()
# Assign the filled column back instead of calling fillna(..., inplace=True) on
# data["total_bedrooms"]: that chained in-place call acts on an intermediate
# Series, emits a FutureWarning in pandas 2.2 and no longer updates the
# DataFrame once Copy-on-Write is enabled.
data["total_bedrooms"] = data["total_bedrooms"].fillna(median)

# Encode categorical variable
housing_cat = data[["ocean_proximity"]]  # double brackets keep a 2-D DataFrame for the encoder
# OrdinalEncoder maps each category to a number. In the recorded output the codes
# follow the sorted category names (INLAND=0, ISLAND=1, NEAR BAY=2, NEAR OCEAN=3),
# so they do not express any meaningful ranking of these categories.
ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)

# Add encoded data back to DataFrame.
# fit_transform returns a 2-D array with one column; flatten it for the column assignment.
data["ocean_proximity_encoded"] = housing_cat_encoded.ravel()

# Display the updated DataFrame
print("\nUpdated Data:")
print(data)


Updated Data:
  ocean_proximity  total_bedrooms  ocean_proximity_encoded
0        NEAR BAY           471.0                      2.0
1          INLAND           371.0                      0.0
2      NEAR OCEAN           369.0                      3.0
3        NEAR BAY           395.0                      2.0
4          ISLAND           541.0                      1.0
5          INLAND           343.0                      0.0
6      NEAR OCEAN           507.0                      3.0
7        NEAR BAY           395.0                      2.0
8        NEAR BAY           478.0                      2.0
9      NEAR OCEAN           335.0                      3.0


In [24]:



#/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////


# Importing necessary libraries
import pandas as pd
from sklearn.preprocessing import OneHotEncoder # this is used for categorical data, in categorical it must be nominal data, e.g(red, green, yello)

# Build a one-column frame holding only the categorical attribute
data = pd.Series(
    [
        'NEAR BAY', 'INLAND', 'NEAR OCEAN', 'NEAR BAY', 'ISLAND',
        'INLAND', 'NEAR OCEAN', 'NEAR BAY', 'NEAR BAY', 'NEAR OCEAN',
    ],
    name='ocean_proximity',
).to_frame()

# Show the frame before encoding
print("Original Data:", data, sep="\n")

Original Data:
  ocean_proximity
0        NEAR BAY
1          INLAND
2      NEAR OCEAN
3        NEAR BAY
4          ISLAND
5          INLAND
6      NEAR OCEAN
7        NEAR BAY
8        NEAR BAY
9      NEAR OCEAN


In [25]:
# One-hot encode the categorical column
housing_cat = data.loc[:, ["ocean_proximity"]]  # list selector keeps a 2-D DataFrame
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)

# Printing the result lists the (row, column) position of every 1.0 entry,
# as seen in the recorded output
print("\nEncoded Data:", housing_cat_1hot, sep="\n")


Encoded Data:
  (0, 2)	1.0
  (1, 0)	1.0
  (2, 3)	1.0
  (3, 2)	1.0
  (4, 1)	1.0
  (5, 0)	1.0
  (6, 3)	1.0
  (7, 2)	1.0
  (8, 2)	1.0
  (9, 3)	1.0


In [30]:
# Feature scaling: a technique to standardize the independent features present in the data to a fixed range

In [35]:
#/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

import pandas as pd

# Two sample features on very different scales; feature1 ends with an outlier (1000)
data1 = pd.DataFrame(
    dict(
        feature1=[10, 20, 30, 40, 1000],
        feature2=[1, 2, 3, 4, 5],
    )
)


In [36]:
# Min-Max Scaling
# However, Min-Max scaling is strongly affected by outliers.

from sklearn.preprocessing import MinMaxScaler

# Min-Max Scaling: x_scaled = (x - x_min) / (x_max - x_min), computed per column
min_max_scaler = MinMaxScaler()
data_min_max_scaled = min_max_scaler.fit_transform(data1)

# The scaler returns a plain array, so rebuild a DataFrame around it. Take the
# labels from the input frame instead of hard-coding the column names, so this
# cell keeps working if data1 gains, loses or renames columns.
data_min_max_scaled_df = pd.DataFrame(
    data_min_max_scaled, columns=data1.columns, index=data1.index
)
print("Original Data:\n", data1)
print("\nMin-Max Scaled Data:\n", data_min_max_scaled_df)


Original Data:
    feature1  feature2
0        10         1
1        20         2
2        30         3
3        40         4
4      1000         5

Min-Max Scaled Data:
    feature1  feature2
0  0.000000      0.00
1  0.010101      0.25
2  0.020202      0.50
3  0.030303      0.75
4  1.000000      1.00


In [38]:
# Standard Scaler: (xi - mean) / standard deviation
# However, standardization is much less affected by outliers.

from sklearn.preprocessing import StandardScaler

# Standardization: subtract each column's mean and divide by its standard deviation
standard_scaler = StandardScaler()
data_standard_scaled = standard_scaler.fit_transform(data1)

# The scaler returns a plain array, so rebuild a DataFrame around it. Take the
# labels from the input frame instead of hard-coding the column names, so this
# cell keeps working if data1 gains, loses or renames columns.
data_standard_scaled_df = pd.DataFrame(
    data_standard_scaled, columns=data1.columns, index=data1.index
)
print("\nStandardized Data:\n", data_standard_scaled_df)



Standardized Data:
    feature1  feature2
0 -0.538285 -1.414214
1 -0.512652 -0.707107
2 -0.487019  0.000000
3 -0.461387  0.707107
4  1.999343  1.414214


In [2]:







##########################################################################################################################################################
# Now it is time to select and train a model.
# Let's first train a Linear Regression model,
# working through a complete example from scratch.

#Step 1: Load and Explore the Data
#First, let's create a small dataset to work with:

import pandas as pd
import numpy as np

# Sample housing data: three predictor columns plus the target 'median_house_value'
data = dict(
    median_income=[2.3442, 8.3014, 5.6431, 3.8462, 4.0368],
    total_rooms=[5612, 7650, 720, 1501, 1454],
    housing_median_age=[29, 42, 25, 52, 36],
    median_house_value=[286600, 340600, 196900, 46300, 254500],
)

housing_full = pd.DataFrame(data)

# Split the table: the target goes to housing_labels, everything else stays in housing
housing_labels = housing_full['median_house_value'].copy()
housing = housing_full.drop(columns=['median_house_value'])

In [3]:
#Step 2: Prepare the Data
#We need to preprocess the data before feeding it to the model. We'll use a pipeline to handle missing values and scaling.

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Preprocessing steps, applied in order: median imputation, then standardization
preprocessing_steps = [
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=preprocessing_steps)

# Fit the pipeline on the features and transform them in the same call
housing_prepared = num_pipeline.fit_transform(housing)

In [4]:
#Step 3: Train the Model
#Now let's train a Linear Regression model.

from sklearn.linear_model import LinearRegression

# Fit a linear regression on the prepared features against the house values
lin_reg = LinearRegression()
lin_reg.fit(X=housing_prepared, y=housing_labels)

In [5]:
#Step 4: Make Predictions
#Let's make predictions on a few instances from the training set.

# Take the first five rows as a sample batch (the sample frame has only five rows,
# so this is the whole training set)
some_data = housing.head(5)
some_labels = housing_labels.head(5)

# Run the batch through the already-fitted preprocessing pipeline (transform only, no refit)
some_data_prepared = num_pipeline.transform(some_data)

# Compare the model's predictions with the true labels
predictions = lin_reg.predict(some_data_prepared)
print("Predictions:", predictions)
print("Labels:", some_labels.tolist())

Predictions: [302069.28378442 334341.83150204 233916.55983588  80863.50972959
 173708.81514807]
Labels: [286600, 340600, 196900, 46300, 254500]


In [6]:
#Step 5: Evaluate the Model
#Measure the RMSE on the whole training set.

from sklearn.metrics import mean_squared_error

# Predict on the same prepared data the model was fitted on
housing_predictions = lin_reg.predict(housing_prepared)

# Training-set error: mean squared error first, then its square root (RMSE)
lin_mse = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions)
lin_rmse = np.sqrt(lin_mse)
print("RMSE:", lin_rmse)

RMSE: 43290.97104580214


In [None]:
#Comparison and Improvement
#If the RMSE is high relative to the range of house prices, it suggests the model is not performing well.
#Here it is high: about 43,291 against house values between 46,300 and 340,600. Possible improvements include:
#Feature Engineering: Add new features that might help the model make better predictions.
#Model Selection: Try more complex models such as Decision Trees, Random Forests, or Gradient Boosting.
#Regularization: Apply regularization techniques to prevent overfitting.
#By following these steps, you can better understand how MSE is calculated and how to interpret and improve the performance of your machine learning model.

In [8]:
#Step 2: Train the DecisionTreeRegressor

# Train the DecisionTreeRegressor model
from sklearn.tree import DecisionTreeRegressor

# Fix random_state so the fitted tree is reproducible from run to run: the
# estimator permutes the candidate features while searching for splits, which
# can change the result when several splits are equally good.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prepared, housing_labels)

In [9]:
# Predict on the very data the tree was fitted on (no held-out set is used here)
housing_predictions = tree_reg.predict(housing_prepared)

# Training-set error: mean squared error first, then its square root (RMSE)
tree_mse = mean_squared_error(y_true=housing_labels, y_pred=housing_predictions)
tree_rmse = np.sqrt(tree_mse)
print("Decision Tree RMSE on training set:", tree_rmse)


Decision Tree RMSE on training set: 0.0
