In [1]:

import numpy as np
import os
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
#from gain import GAIN
from sklearn.neural_network import MLPRegressor
from sklearn.experimental import enable_iterative_imputer
# from missingpy import MissForest
from scipy.stats.mstats import winsorize
from sklearn.ensemble import IsolationForest
from sklearn.cluster import DBSCAN
from scipy import stats
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, RobustScaler, PowerTransformer, QuantileTransformer, MaxAbsScaler, Normalizer, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from category_encoders import TargetEncoder, HelmertEncoder, SumEncoder, BackwardDifferenceEncoder, LeaveOneOutEncoder, JamesSteinEncoder, BinaryEncoder

import sys
import dvc.api
import json


In [3]:

# Start from empty placeholders so the four option names always exist at module level.
model_type = missing_option = categorical_option = scaling_option = ""

# Read the preprocessing choices from the shared parameter file.
with open('D:/Study/INTERNSHIP/FINAL/params.json','r') as f:
    di = json.loads(f.read())

# Copy the settings one at a time, in the same order as before.
model_type = di['model_type']
missing_option = di['null_values']
categorical_option = di['encoding']
scaling_option = di['scaling']
    
def get_categorical_columns(dataset):
    """Return the names of the columns of ``dataset`` whose dtype is ``object``.

    Columns with ``object`` dtype are treated as categorical; adjust the
    ``include`` list if the data uses other dtypes for categories.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    list
        Column labels, in the order they appear in ``dataset``.
    """
    # DataFrame.info() writes its report itself and returns None, so it is
    # called directly; wrapping it in print() emitted a stray "None" line.
    dataset.info()
    return dataset.select_dtypes(include=['object']).columns.tolist()

def _impute_missing(dataset, missing_option):
    """Fill the columns of ``dataset`` that contain nulls, using ``missing_option``."""
    missing_columns = dataset.columns[dataset.isnull().any()].tolist()
    if not missing_columns:
        # Nothing to impute, so the option is not even looked at (as before).
        return dataset

    if missing_option == "Mean":
        imputer = SimpleImputer(strategy='mean')
    elif missing_option == "Median":
        imputer = SimpleImputer(strategy='median')
    elif missing_option == "Mode":
        imputer = SimpleImputer(strategy='most_frequent')
    elif missing_option == "Linear Regression Imputation":
        # No estimator is passed, so IterativeImputer uses its default one.
        imputer = IterativeImputer(max_iter=10, random_state=0)
    else:
        # Previously this case left `imputer` unbound and crashed with an
        # UnboundLocalError on the next line.
        raise ValueError(
            "Invalid input for handling missing values: %r" % (missing_option,)
        )

    imputed = pd.DataFrame(
        imputer.fit_transform(dataset[missing_columns]),
        columns=missing_columns,
        index=dataset.index,
    )
    # With mixed numeric/string input the imputer hands back an object array;
    # infer_objects() restores numeric dtypes so those columns are not later
    # mistaken for categorical ones.
    dataset[missing_columns] = imputed.infer_objects()
    return dataset


def _encode_categoricals(dataset, categorical_columns, categorical_option, target=None):
    """Encode ``categorical_columns`` of ``dataset`` according to ``categorical_option``."""
    if categorical_option == 'None':
        print("None")
        return dataset

    contrast_options = ('Helmert Encoding', 'Sum Encoding', 'Backward Difference Encoding')
    supported = ('One Hot Encoding', 'Label Encoding', 'James-Stein Encoder') + contrast_options
    if categorical_option not in supported:
        raise ValueError("Invalid input for encoding categorical data!!")

    if not categorical_columns:
        # A valid option with nothing to encode is a no-op, not an error.
        return dataset

    if categorical_option == 'Label Encoding':
        for column in categorical_columns:
            dataset[column] = dataset[column].astype('category').cat.codes
        return dataset

    if categorical_option == 'James-Stein Encoder':
        # James-Stein is a supervised encoder: it cannot be fitted without a target.
        if target is None:
            raise ValueError(
                "James-Stein encoding needs a target: pass `target` as a column "
                "name of `dataset` or as an array-like of target values."
            )
        if isinstance(target, str):
            target_values = dataset[target]
            columns = [c for c in categorical_columns if c != target]
        else:
            target_values = target
            columns = categorical_columns
        if columns:
            encoder = JamesSteinEncoder(cols=columns)
            dataset[columns] = encoder.fit_transform(dataset[columns], target_values)
        return dataset

    if categorical_option == 'One Hot Encoding':
        encoder = OneHotEncoder()
        encoded = pd.DataFrame(
            encoder.fit_transform(dataset[categorical_columns]).toarray(),
            columns=encoder.get_feature_names_out(categorical_columns),
            index=dataset.index,
        )
    else:
        encoder_cls = {
            'Helmert Encoding': HelmertEncoder,
            'Sum Encoding': SumEncoder,
            'Backward Difference Encoding': BackwardDifferenceEncoder,
        }[categorical_option]
        encoded = encoder_cls(cols=categorical_columns).fit_transform(dataset[categorical_columns])
        # Keep row labels identical so the concat below pairs rows by position.
        encoded.index = dataset.index

    # These encoders may return a different number of columns than they were
    # given, so the originals are dropped and the encoded columns appended
    # instead of being assigned back into the original column slots. This also
    # keeps the non-categorical columns, which one-hot encoding used to discard.
    return pd.concat([dataset.drop(columns=categorical_columns), encoded], axis=1)


def _scale_features(dataset, categorical_columns, scaling_option):
    """Scale every numeric/boolean column of ``dataset`` with ``scaling_option``."""
    scaler_factories = {
        # with_mean=False is kept from the original code: values are divided by
        # the standard deviation but not centred.
        # NOTE(review): the original comment justified this with sparse input,
        # yet a dense DataFrame is passed here -- confirm whether real
        # z-scoring (centring included) was intended.
        'Standard Scaling': lambda: StandardScaler(with_mean=False),
        'Min-Max Scaling': lambda: MinMaxScaler(),
        'Robust Scaling': lambda: RobustScaler(),
        'Power Transformation': lambda: PowerTransformer(method='yeo-johnson', standardize=True),
        'Quantile Transformation': lambda: QuantileTransformer(output_distribution='uniform'),
        'MaxAbsScaler': lambda: MaxAbsScaler(),
        'Normalizer': lambda: Normalizer(),
    }
    if scaling_option not in scaler_factories:
        print(scaling_option)
        raise ValueError("Invalid input for feature scaling technique.")

    # Only numeric data can be scaled; string columns left untouched by the
    # 'None' encoding option used to make the scaler fail.
    numerical_columns = dataset.select_dtypes(include=['number', 'bool']).columns.tolist()
    print(numerical_columns)
    print(categorical_columns)
    if not numerical_columns:
        return dataset

    scaler = scaler_factories[scaling_option]()
    # Column assignment replaces the columns outright, so integer columns can
    # take the float results (unlike the former `iloc[:, :] = ...`).
    dataset[numerical_columns] = scaler.fit_transform(dataset[numerical_columns])
    return dataset


def transform(dataset, missing_option, categorical_option, scaling_option, target=None):
    """Impute, encode and scale ``dataset`` according to the given options.

    The three stages run in this order: missing-value imputation, categorical
    encoding, feature scaling.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Data to preprocess. It may be modified in place; always use the
        returned frame.
    missing_option : str
        One of "Mean", "Median", "Mode", "Linear Regression Imputation".
        Only consulted when some column contains nulls.
    categorical_option : str
        One of 'None', 'One Hot Encoding', 'Label Encoding',
        'Helmert Encoding', 'Sum Encoding', 'Backward Difference Encoding',
        'James-Stein Encoder'.
    scaling_option : str
        One of 'Standard Scaling', 'Min-Max Scaling', 'Robust Scaling',
        'Power Transformation', 'Quantile Transformation', 'MaxAbsScaler',
        'Normalizer'.
    target : str or array-like, optional
        Target used by 'James-Stein Encoder': either the name of a column of
        ``dataset`` or the target values themselves. Ignored by every other
        encoding option.

    Returns
    -------
    pandas.DataFrame
        The preprocessed frame. For one-hot and contrast encodings the encoded
        columns are appended after the remaining original columns.

    Raises
    ------
    ValueError
        If an option is not one of the supported values, or if
        'James-Stein Encoder' is requested without ``target``.
    """
    print(missing_option, categorical_option, scaling_option)

    dataset = _impute_missing(dataset, missing_option)

    categorical_columns = get_categorical_columns(dataset)
    dataset = _encode_categoricals(dataset, categorical_columns, categorical_option, target)

    return _scale_features(dataset, categorical_columns, scaling_option)

def pretime(train_df, start='2015-01-08', freq='W'):
    """Attach a generated date column and calendar features to ``train_df``.

    One timestamp per row is produced with ``pd.date_range`` and assigned by
    row position, then split into 'Day', 'Month', 'Year' and 'Quarter'.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame to extend. It is modified in place.
    start : str or datetime-like, default '2015-01-08'
        Left bound handed to ``pd.date_range``.
    freq : str, default 'W'
        Frequency alias handed to ``pd.date_range``. With the default weekly
        alias and start, the first generated date is 2015-01-11.

    Returns
    -------
    pandas.DataFrame
        The same ``train_df`` object, with the five columns added.
    """
    train_df['Date'] = pd.date_range(start, periods=len(train_df), freq=freq)

    # Derive every calendar feature from the column that was just written.
    dates = train_df['Date'].dt
    train_df['Day'] = dates.day
    train_df['Month'] = dates.month
    train_df['Year'] = dates.year
    train_df['Quarter'] = dates.quarter

    return train_df


######################## Main Code Starts ################################


In [6]:
  
# Input directory and output file for this preprocessing step.
raw_data_path = 'D:/Study/INTERNSHIP/FINAL/data/raw/'
# Fixed: this path used to start with 'D:Study/...' (no separator after the
# drive letter), which Windows resolves relative to the current directory on
# drive D: rather than from the drive root like raw_data_path.
processed_data_path = 'D:/Study/INTERNSHIP/FINAL/data/processed/preprocessed_data.csv'

# Load the raw training data and drop these four columns in one step.
merged_df = pd.read_csv(raw_data_path + 'train1.csv').drop(
    columns=['city_code', 'region_code', 'center_type', 'op_area']
)


In [8]:
# Preview the first rows of merged_df.
# NOTE(review): the recorded output below shows each cell holding a JSON
# fragment such as '"week": 1' rather than a bare value -- presumably
# train1.csv contains JSON text instead of plain CSV values; confirm.
merged_df.head()

Unnamed: 0,id,week,center_id,meal_id,category,cuisine,checkout_price,base_price,emailer_for_promotion,homepage_featured,num_orders
0,"{""id"": 1379560","""week"": 1","""center_id"": 55","""meal_id"": 1885","""category"": ""Beverages""","""cuisine"": ""Thai""","""checkout_price"": 136.83","""base_price"": 152.29","""emailer_for_promotion"": 0","""homepage_featured"": 0","""num_orders"": 177}"
1,"{""id"": 1018704","""week"": 2","""center_id"": 55","""meal_id"": 1885","""category"": ""Beverages""","""cuisine"": ""Thai""","""checkout_price"": 135.83","""base_price"": 152.29","""emailer_for_promotion"": 0","""homepage_featured"": 0","""num_orders"": 323}"
2,"{""id"": 1196273","""week"": 3","""center_id"": 55","""meal_id"": 1885","""category"": ""Beverages""","""cuisine"": ""Thai""","""checkout_price"": 132.92","""base_price"": 133.92","""emailer_for_promotion"": 0","""homepage_featured"": 0","""num_orders"": 96}"
3,"{""id"": 1116527","""week"": 4","""center_id"": 55","""meal_id"": 1885","""category"": ""Beverages""","""cuisine"": ""Thai""","""checkout_price"": 135.86","""base_price"": 134.86","""emailer_for_promotion"": 0","""homepage_featured"": 0","""num_orders"": 163}"
4,"{""id"": 1343872","""week"": 5","""center_id"": 55","""meal_id"": 1885","""category"": ""Beverages""","""cuisine"": ""Thai""","""checkout_price"": 146.5","""base_price"": 147.5","""emailer_for_promotion"": 0","""homepage_featured"": 0","""num_orders"": 215}"


In [2]:
import json

In [3]:
# Pull the Kafka connection settings out of the shared parameter file.
with open('D:/Study/INTERNSHIP/FINAL/params.json','r') as f:
    di = json.loads(f.read())

# Show everything that was loaded, then pick out the two Kafka entries.
print(di)
topic = di['kafka_topic']
bootstrap_servers = di['kafka_url']

{'kafka_topic': 'finald', 'kafka_url': 'localhost:9092', 'model_type': 'Regression Models', 'null_values': 'Mean', 'encoding': 'None', 'scaling': 'Standard Scaling', 'model_id': 'Lasso Regression', 'tuning': 'No Tuning', 'db_type': 'sqlite', 'db_name': 'Temp', 'db_url': 'Something'}


In [4]:
# Display the Kafka topic name that was read from params.json.
topic

'finald'