In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import pickle
import os

In [2]:
data = pd.read_csv('data_modified.csv')

In [3]:
data.head()

Unnamed: 0,Age,Education_Level,Occupation,Number_of_Dependents,Location,Work_Experience,Marital_Status,Employment_Status,Household_Size,Homeownership_Status,Type_of_Housing,Gender,Primary_Mode_of_Transportation,Income,Age_Group,Living_Standards,Cluster
0,56,Master's,Technology,5,Urban,21,Married,Full-time,7,Own,Apartment,Male,Public transit,72510,46-60,Medium,2
1,69,High School,Finance,0,Urban,4,Single,Full-time,7,Own,Apartment,Male,Biking,75462,61-75,Medium,2
2,46,Bachelor's,Technology,1,Urban,1,Single,Full-time,7,Own,Single-family home,Female,Car,71748,46-60,Medium,2
3,32,High School,Others,2,Urban,32,Married,Full-time,1,Own,Apartment,Female,Car,74520,31-45,Medium,1
4,60,Bachelor's,Finance,3,Urban,15,Married,Self-employed,4,Own,Townhouse,Male,Walking,640210,46-60,High,2


In [4]:
object_columns = data.select_dtypes(include='object')

In [5]:
for column in object_columns.columns:
    print(f"Unique values in {column}: {object_columns[column].unique()}")

Unique values in Education_Level: ["Master's" 'High School' "Bachelor's" 'Doctorate']
Unique values in Occupation: ['Technology' 'Finance' 'Others' 'Education' 'Healthcare']
Unique values in Location: ['Urban' 'Rural' 'Suburban']
Unique values in Marital_Status: ['Married' 'Single' 'Divorced']
Unique values in Employment_Status: ['Full-time' 'Self-employed' 'Part-time']
Unique values in Homeownership_Status: ['Own' 'Rent']
Unique values in Type_of_Housing: ['Apartment' 'Single-family home' 'Townhouse']
Unique values in Gender: ['Male' 'Female']
Unique values in Primary_Mode_of_Transportation: ['Public transit' 'Biking' 'Car' 'Walking']
Unique values in Age_Group: ['46-60' '61-75' '31-45' '15-30']
Unique values in Living_Standards: ['Medium' 'High' 'Low']


In [6]:
features = ['Age', 'Work_Experience', 'Household_Size', 'Income']

In [7]:
sample = data[features]

In [8]:
sample.head()

Unnamed: 0,Age,Work_Experience,Household_Size,Income
0,56,21,7,72510
1,69,4,7,75462
2,46,1,7,71748
3,32,32,1,74520
4,60,15,4,640210


In [9]:
sample.head()

Unnamed: 0,Age,Work_Experience,Household_Size,Income
0,56,21,7,72510
1,69,4,7,75462
2,46,1,7,71748
3,32,32,1,74520
4,60,15,4,640210


In [10]:
# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(sample)

# Train the KMeans model
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(X_scaled)

In [11]:
# Create the artifacts directory if it doesn't exist
os.makedirs('artifacts', exist_ok=True)


In [15]:
# Save the KMeans model and the scaler using pickle
with open('kmeans_model.pkl', 'wb') as model_file:
    pickle.dump(kmeans, model_file)

In [17]:
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [12]:
#sample_random_subset.to_csv('kmeans.csv')


In [18]:
print("Model and scaler have been saved successfully.")

Model and scaler have been saved successfully.


In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import AdaBoostRegressor

In [25]:
# Load the data
data = pd.read_csv('data_modified.csv')

In [26]:
# Assume 'target' is the name of the target variable
X = data.drop(columns='Income')
y = data['Income']

# Define column types
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=6112024)

# Initialize models
models = {'AdaBoost': AdaBoostRegressor(random_state=6112024)}
param_grids = {'AdaBoost': {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1.0]}}

In [30]:
# Function to evaluate models
def evaluate_models(X_train, y_train, X_test, y_test, models, param):
    results = {}
    for model_name in models:
        print(f"Evaluating {model_name}")
        
        pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                   ('regressor', models[model_name])])
        
        grid_search = GridSearchCV(pipeline, param_grid=param[model_name], cv=5, scoring='neg_mean_squared_error')
        grid_search.fit(X_train, y_train)
        
        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(X_test)
        
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        
        results[model_name] = {
            'best_params': grid_search.best_params_,
            'mean_squared_error': mse,
            'r2_score': r2
        }
        
    return results

# Evaluate models and get a report of their performance
model_report = evaluate_models(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, models=models, param=param_grids)

Evaluating AdaBoost


ValueError: Invalid parameter 'learning_rate' for estimator Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['Age',
                                                   'Number_of_Dependents',
                                                   'Work_Experience',
                                                   'Household_Size',
                                                   'Cluster']),
                                                 ('cat', OneHotEncoder(),
                                                  ['Education_Level',
                                                   'Occupation', 'Location',
                                                   'Marital_Status',
                                                   'Employment_Status',
                                                   'Homeownership_Status',
                                                   'Type_of_Housing', 'Gender',
                                                   'Primary_Mode_of_Transportation',
                                                   'Age_Group',
                                                   'Living_Standards'])])),
                ('regressor', AdaBoostRegressor(random_state=6112024))]). Valid parameters are: ['memory', 'steps', 'verbose'].

In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
import pickle

# Load the data
data = pd.read_csv('data_modified.csv')
print(data.dtypes)
# Assume 'target' is the name of the target variable
X = data.drop(columns='Income')
y = data['Income']

# Define column types
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=6112024)

# Initialize models
models = {'Random Forest': RandomForestRegressor(random_state=6112024)}
param_grids = {'Random Forest': {'n_estimators': [100, 200, 300], 'max_depth': [None, 10, 20], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]}}

# Function to evaluate models
def evaluate_models(X_train, y_train, X_test, y_test, models, param):
    results = {}
    best_model = None
    for model_name in models:
        print(f"Evaluating {model_name}")
        
        pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                   ('regressor', models[model_name])])
        
        grid_search = GridSearchCV(pipeline, param_grid=param[model_name], cv=5, scoring='neg_mean_squared_error')
        grid_search.fit(X_train, y_train)
        
        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(X_test)
        
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        
        results[model_name] = {
            'best_params': grid_search.best_params_,
            'mean_squared_error': mse,
            'r2_score': r2
        }
    
    return results, best_model

# Evaluate models and get a report of their performance
model_report, best_model = evaluate_models(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test, models=models, param=param_grids)


Age                                int64
Education_Level                   object
Occupation                        object
Number_of_Dependents               int64
Location                          object
Work_Experience                    int64
Marital_Status                    object
Employment_Status                 object
Household_Size                     int64
Homeownership_Status              object
Type_of_Housing                   object
Gender                            object
Primary_Mode_of_Transportation    object
Income                             int64
Age_Group                         object
Living_Standards                  object
Cluster                            int64
dtype: object
Evaluating AdaBoost


In [36]:
model_report

{'AdaBoost': {'best_params': {'regressor__learning_rate': 0.01,
   'regressor__n_estimators': 50},
  'mean_squared_error': 495012970976.5116,
  'r2_score': 0.845688190740403}}

In [37]:
# Save the best model to a pickle file
with open('model.pkl', 'wb') as file:
    pickle.dump(best_model, file)

In [41]:
# Define the categorical and numerical columns
categorical_columns = ["Age_Group", "Primary_Mode_of_Transportation", "Education_Level", "Occupation", "Marital_Status", "Living_Standards", "Gender", "Homeownership_Status", "Location", "Type_of_Housing", "Employment_Status"]
numerical_columns = ["Work_Experience", "Number_of_Dependents", "Household_Size", "Age", "Cluster"]


# Create pipelines for numerical and categorical data processing
num_pipeline = Pipeline(steps=[("scaler", StandardScaler())])
cat_pipeline = Pipeline(steps=[("one_hot_encoder", OneHotEncoder()), ("scaler", StandardScaler(with_mean=False))])

# Combine the pipelines into a single preprocessor
preprocessor = ColumnTransformer([
    ("num_pipeline", num_pipeline, numerical_columns),
    ("cat_pipeline", cat_pipeline, categorical_columns)
])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=6112024)

# Fit the preprocessor on the training data
preprocessor.fit(X_train)

In [43]:

# Save the preprocessor to a pickle file
with open('preprocessor.pkl', 'wb') as file:
    pickle.dump(preprocessor, file)

In [44]:
print("Preprocessor has been saved to 'preprocessor.pkl'.")

Preprocessor has been saved to 'preprocessor.pkl'.


In [55]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import pickle

data=pd.read_csv('data_modified.csv')
print(data.dtypes)
# Identify categorical and numerical columns
categorical_cols = data.select_dtypes(include=['object']).columns.tolist()
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
numerical_cols.remove('Income')  # Remove target variable from numerical columns

# Preprocessing for categorical data: One-hot encoding
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Preprocessing for numerical data: Standardization
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Define the model
model = RandomForestRegressor(random_state=42)

# Create and train the complete pipeline
full_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                ('model', model)])

# Split data
X = data.drop('Income', axis=1)
y = data['Income']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model
full_pipeline.fit(X_train, y_train)

# Save the preprocessor and the model in two separate pickle files
with open('preprocessor_new.pkl', 'wb') as f:
    pickle.dump(preprocessor, f)
    
"""with open('/mnt/data/model.pkl', 'wb') as f:
    pickle.dump(full_pipeline.named_steps['model'], f)
"""
"/mnt/data/preprocessor.pkl", "/mnt/data/model.pkl"


Age                                int64
Education_Level                   object
Occupation                        object
Number_of_Dependents               int64
Location                          object
Work_Experience                    int64
Marital_Status                    object
Employment_Status                 object
Household_Size                     int64
Homeownership_Status              object
Type_of_Housing                   object
Gender                            object
Primary_Mode_of_Transportation    object
Income                             int64
Age_Group                         object
Living_Standards                  object
Cluster                            int64
dtype: object


('/mnt/data/preprocessor.pkl', '/mnt/data/model.pkl')

In [56]:
# Load the preprocessor and test it
with open('preprocessor_new.pkl', 'rb') as f:
    loaded_preprocessor = pickle.load(f)


transformed_sample = loaded_preprocessor.transform(X_test)

print(transformed_sample)

[[ 0.51963248  0.2712634   0.90221572 ...  0.          0.
   1.        ]
 [-1.32347356  0.2712634  -0.0551421  ...  1.          0.
   0.        ]
 [ 1.70448637 -0.89108646 -1.28603072 ...  0.          1.
   0.        ]
 ...
 [-0.6652214  -1.4722614  -0.0551421  ...  0.          0.
   1.        ]
 [-1.52094921 -1.4722614  -0.46543831 ...  0.          0.
   1.        ]
 [-1.25764835  0.2712634  -0.46543831 ...  0.          0.
   1.        ]]


In [57]:
# Print transformed sample and get its size and shape
transformed_sample_size = transformed_sample.size
transformed_sample_shape = transformed_sample.shape

transformed_sample, transformed_sample_size, transformed_sample_shape

(array([[ 0.51963248,  0.2712634 ,  0.90221572, ...,  0.        ,
          0.        ,  1.        ],
        [-1.32347356,  0.2712634 , -0.0551421 , ...,  1.        ,
          0.        ,  0.        ],
        [ 1.70448637, -0.89108646, -1.28603072, ...,  0.        ,
          1.        ,  0.        ],
        ...,
        [-0.6652214 , -1.4722614 , -0.0551421 , ...,  0.        ,
          0.        ,  1.        ],
        [-1.52094921, -1.4722614 , -0.46543831, ...,  0.        ,
          0.        ,  1.        ],
        [-1.25764835,  0.2712634 , -0.46543831, ...,  0.        ,
          0.        ,  1.        ]]),
 82000,
 (2000, 41))

In [58]:
# Identify categorical and numerical columns
categorical_cols = data.select_dtypes(include=['object']).columns.tolist()
numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
numerical_cols.remove('Income')  # Remove target variable from numerical columns

# Display number of unique values in each categorical column
unique_values_per_cat_col = {col: data[col].nunique() for col in categorical_cols}
print("Unique values per categorical column:", unique_values_per_cat_col)

# Total number of columns after one-hot encoding
total_new_columns = sum(unique_values_per_cat_col.values())
print("Total new columns due to one-hot encoding:", total_new_columns)

# Original numerical columns count
num_numerical_cols = len(numerical_cols)
print("Number of numerical columns:", num_numerical_cols)

# Total columns after preprocessing
total_columns_after_preprocessing = total_new_columns + num_numerical_cols
print("Total columns after preprocessing:", total_columns_after_preprocessing)

Unique values per categorical column: {'Education_Level': 4, 'Occupation': 5, 'Location': 3, 'Marital_Status': 3, 'Employment_Status': 3, 'Homeownership_Status': 2, 'Type_of_Housing': 3, 'Gender': 2, 'Primary_Mode_of_Transportation': 4, 'Age_Group': 4, 'Living_Standards': 3}
Total new columns due to one-hot encoding: 36
Number of numerical columns: 5
Total columns after preprocessing: 41
