In [None]:
# Data manipulation
import pandas as pd
import numpy as np

# Data preprocessing
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

# Evaluation metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [None]:
# Read the raw dataset from disk into a DataFrame
csv_path = 'your_data.csv'
data = pd.read_csv(csv_path)

# Show the first rows as a quick sanity check of the load
data.head()


In [None]:
# Handling missing values
#
# NOTE(review): both imputers are fitted on the full dataset, i.e. before the
# train/test split made further down; fit them on the training rows only if a
# leakage-free evaluation is required.

# Impute numerical columns with the median value.
# The emptiness guard matters: SimpleImputer refuses an input with zero
# columns, so a dataset without numeric columns would otherwise crash here.
numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns
imputer = SimpleImputer(strategy='median')
if len(numerical_cols) > 0:
    data[numerical_cols] = imputer.fit_transform(data[numerical_cols])

# Impute categorical columns with a placeholder value ('Unknown').
# Same guard: a purely numeric dataset has no object columns to impute.
categorical_cols = data.select_dtypes(include=['object']).columns
imputer_cat = SimpleImputer(strategy='constant', fill_value='Unknown')
if len(categorical_cols) > 0:
    data[categorical_cols] = imputer_cat.fit_transform(data[categorical_cols])

# Drop duplicate rows (if applicable)
data = data.drop_duplicates()

# Optional: Handle outliers (if needed)
# Example: Removing rows where 'feature' is outside the acceptable range
# data = data[data['feature'] < threshold_value]


In [None]:
# Feature Engineering

# Extract datetime features (if you have a timestamp column).
# errors='coerce' turns unparseable entries into NaT instead of raising. That
# includes the 'Unknown' placeholder which the missing-value step writes into
# every object column, 'created_at' among them when it was read as text.
data['created_at'] = pd.to_datetime(data['created_at'], errors='coerce')

# Rows without a usable timestamp cannot provide hour/dayofweek/month, and the
# resulting NaN features would be rejected by estimators such as
# LinearRegression, so those rows are removed here.
data = data.dropna(subset=['created_at'])

data['hour'] = data['created_at'].dt.hour
data['dayofweek'] = data['created_at'].dt.dayofweek
data['month'] = data['created_at'].dt.month

# Convert categorical columns using one-hot encoding if necessary
# (drop_first=True drops one level per encoded column)
data = pd.get_dummies(data, drop_first=True)

# Example: Create an interaction term between two features
data['total_item_value'] = data['total_items'] * data['subtotal']


In [None]:
# Feature scaling (Standardization)
#
# The target column is deliberately left unscaled so that predictions and the
# reported RMSE/MAE stay in the target's original units.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split made further down; fit it on the training rows only if a leakage-free
# evaluation is required.
scaler = StandardScaler()
numerical_cols_scaled = [
    col
    for col in data.select_dtypes(include=['float64', 'int64']).columns
    if col != 'target_column'
]
# Guard against an empty selection, which StandardScaler would reject
if numerical_cols_scaled:
    data[numerical_cols_scaled] = scaler.fit_transform(data[numerical_cols_scaled])


In [None]:
# Separate the prediction target from the predictor columns
target_name = 'target_column'
y = data[target_name]  # the variable to predict
X = data.drop(columns=[target_name, 'created_at'])  # everything except the target and the raw timestamp

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


### Linear Regression

In [None]:
# Fit a linear regression on the training split (fit() returns the estimator itself)
lr_model = LinearRegression().fit(X_train, y_train)

# Predict the held-out rows
y_pred_lr = lr_model.predict(X_test)

# Report test-set error and goodness of fit
lr_rmse = np.sqrt(mean_squared_error(y_test, y_pred_lr))
lr_r2 = r2_score(y_test, y_pred_lr)
print(f'Linear Regression RMSE: {lr_rmse}')
print(f'Linear Regression R2 Score: {lr_r2}')


### Random Forest

In [None]:
# Fit a 100-tree random forest on the training split (seeded for repeatability)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out rows
y_pred_rf = rf_model.predict(X_test)

# Report test-set error and goodness of fit
rf_rmse = np.sqrt(mean_squared_error(y_test, y_pred_rf))
rf_r2 = r2_score(y_test, y_pred_rf)
print(f'Random Forest RMSE: {rf_rmse}')
print(f'Random Forest R2 Score: {rf_r2}')


In [5]:
# Hyperparameter tuning (randomized search over a random forest)
#
# Requires X_train / y_train from the train/test split cell; run the notebook
# top to bottom, otherwise this cell fails with a NameError.
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import randint

# Define the model
rf_model = RandomForestRegressor(random_state=42)

# Define the hyperparameter distribution
param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 4),
    # 'auto' is no longer accepted by RandomForestRegressor in recent
    # scikit-learn releases; 1.0 (use all features) is what 'auto' meant for
    # regressors, so the search space is unchanged.
    'max_features': [1.0, 'sqrt', 'log2'],
    'bootstrap': [True, False]
}

# Initialize RandomizedSearchCV: 100 sampled candidates x 5 folds, all cores,
# ranked by negative MSE (higher is better)
random_search = RandomizedSearchCV(estimator=rf_model, param_distributions=param_dist,
                                   n_iter=100, cv=5, n_jobs=-1, verbose=2,
                                   scoring='neg_mean_squared_error', random_state=42)

# Fit the model to the data
random_search.fit(X_train, y_train)

# Print the best parameters and best score
print(f"Best parameters for RF (RandomizedSearchCV): {random_search.best_params_}")
print(f"Best score (neg MSE) for RF: {random_search.best_score_}")


NameError: name 'X_train' is not defined

### Cross Validation RF

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

# Estimator to be cross-validated (100 trees, seeded)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# 5-fold cross-validation on the training split, scored by negative MSE
cv_scores = cross_val_score(
    estimator=rf_model,
    X=X_train,
    y=y_train,
    cv=5,
    scoring='neg_mean_squared_error',
)

# Flip the sign back to MSE and take the root to obtain one RMSE per fold
rmse_scores = np.sqrt(np.negative(cv_scores))
mean_rmse = rmse_scores.mean()
print(f'Random Forest Cross-Validation RMSE: {rmse_scores}')
print(f'Mean RMSE: {mean_rmse}')


### Decision Tree

In [None]:
# Fit a seeded decision tree on the training split (fit() returns the estimator itself)
dt_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Predict the held-out rows
y_pred_dt = dt_model.predict(X_test)

# Report test-set error and goodness of fit
dt_rmse = np.sqrt(mean_squared_error(y_test, y_pred_dt))
dt_r2 = r2_score(y_test, y_pred_dt)
print(f'Decision Tree RMSE: {dt_rmse}')
print(f'Decision Tree R2 Score: {dt_r2}')


### Support Vector Regressor

In [None]:
# Fit a support vector regressor with default settings on the training split
svr_model = SVR().fit(X_train, y_train)

# Predict the held-out rows
y_pred_svr = svr_model.predict(X_test)

# Report test-set error and goodness of fit
svr_rmse = np.sqrt(mean_squared_error(y_test, y_pred_svr))
svr_r2 = r2_score(y_test, y_pred_svr)
print(f'SVR RMSE: {svr_rmse}')
print(f'SVR R2 Score: {svr_r2}')


### KNN

In [None]:
# Fit a k-nearest-neighbours regressor with default settings on the training split
knn_model = KNeighborsRegressor().fit(X_train, y_train)

# Predict the held-out rows
y_pred_knn = knn_model.predict(X_test)

# Report test-set error and goodness of fit
knn_rmse = np.sqrt(mean_squared_error(y_test, y_pred_knn))
knn_r2 = r2_score(y_test, y_pred_knn)
print(f'KNN RMSE: {knn_rmse}')
print(f'KNN R2 Score: {knn_r2}')


In [None]:
# Full metric report (RMSE, MAE, R2) for one set of test predictions.
#
# `y_pred` was previously never assigned, so this cell failed with a NameError.
# It now evaluates the KNN predictions produced by the cell above; point it at
# y_pred_lr, y_pred_rf, y_pred_dt or y_pred_svr to report on another model.
y_pred = y_pred_knn

# RMSE (Root Mean Squared Error)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'RMSE: {rmse}')

# MAE (Mean Absolute Error)
mae = mean_absolute_error(y_test, y_pred)
print(f'MAE: {mae}')

# R2 Score
r2 = r2_score(y_test, y_pred)
print(f'R2 Score: {r2}')


#### Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor

# Estimator to be cross-validated (5 neighbours)
knn_model = KNeighborsRegressor(n_neighbors=5)

# 5-fold cross-validation on the training split, scored by negative MSE
cv_scores_knn = cross_val_score(
    estimator=knn_model,
    X=X_train,
    y=y_train,
    cv=5,
    scoring='neg_mean_squared_error',
)

# Flip the sign back to MSE and take the root to obtain one RMSE per fold
rmse_scores_knn = np.sqrt(np.negative(cv_scores_knn))
mean_rmse_knn = rmse_scores_knn.mean()
print(f'KNN Cross-Validation RMSE: {rmse_scores_knn}')
print(f'Mean RMSE for KNN: {mean_rmse_knn}')


## Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Example: Hyperparameter tuning for Random Forest
# Exhaustive search over 3 x 3 = 9 parameter combinations, 5 folds each.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None]
}

# The forest is seeded like every other estimator in this notebook; without a
# seed the selected parameters can differ from run to run.
# No `scoring` is passed, so candidates are ranked by the estimator's own
# score method.
grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best parameters
print("Best Parameters:", grid_search.best_params_)


## Deploy

In [None]:
import joblib

# Persist a *fitted* estimator.
# By this point `rf_model` has been re-bound to a fresh RandomForestRegressor by
# the cross-validation cell and never fitted (cross_val_score trains clones of
# the estimator it is given), so saving it would make the predict() call below
# fail. The grid search refits its best configuration on the whole training
# split by default, which makes `best_estimator_` ready to use.
final_model = grid_search.best_estimator_

# Save the trained model to a file
joblib.dump(final_model, 'random_forest_model.pkl')

# Load the model back
# NOTE: joblib files are pickle-based; only load files from a trusted source.
loaded_model = joblib.load('random_forest_model.pkl')

# Make predictions with the loaded model
predictions = loaded_model.predict(X_test)


In [2]:
## Table operations

In [3]:
import sqlite3
from contextlib import closing

import pandas as pd

# Three alternative ways to load a table. Each assignment replaces `data`, so
# keep only the loader that matches your source; the file and table names are
# placeholders and must point at real resources.

# Load data from a CSV file
data = pd.read_csv('your_file.csv')

# Load data from an Excel file
data = pd.read_excel('your_file.xlsx')

# Load data from a SQL database.
# closing() guarantees the connection is released even if the query fails;
# a sqlite3 connection used directly as a context manager would not close it.
with closing(sqlite3.connect('your_database.db')) as conn:
    data = pd.read_sql_query("SELECT * FROM your_table", conn)


FileNotFoundError: [Errno 2] No such file or directory: 'your_file.csv'

In [None]:
# Quick structural overview of the DataFrame.
# A notebook cell only renders the value of its final expression, so every
# intermediate result is handed to display() (provided by the IPython/Jupyter
# kernel); previously all but the last one were silently discarded.

# View the first few rows of the dataframe
display(data.head())  # By default, shows first 5 rows

# View the last few rows of the dataframe
display(data.tail())  # By default, shows last 5 rows

# Get the data types of each column
display(data.dtypes)

# Get a summary of the dataframe (e.g., number of rows, columns, etc.)
data.info()  # prints its report itself

# Get basic statistics for numerical columns
display(data.describe())

# Check for missing values in each column
display(data.isnull().sum())

# Check the unique values in a specific column
display(data['column_name'].unique())

# Get the number of unique values in each column
data.nunique()


In [None]:
# Selection examples. Column names, `value` and 'row_label' are placeholders.
# A notebook cell only renders the value of its final expression, so the
# intermediate selections are handed to display() (provided by the
# IPython/Jupyter kernel) instead of being silently discarded.

# Select a single column
display(data['column_name'])  # Or data.column_name

# Select multiple columns
display(data[['column1', 'column2']])

# Select rows based on conditions
display(data[data['column_name'] > value])

# Select rows by index position
display(data.iloc[0:10])  # Positions 0 through 9 (the end of the slice is exclusive)

# Select rows by label (index)
data.loc['row_label']  # Use index label


In [None]:
# Column and row editing examples.
# NOTE(review): these are independent snippets with placeholder names; e.g.
# 'column_name' is dropped below and then referenced again further down.
# Results are reassigned rather than modified with inplace=True, matching the
# `data = data.drop_duplicates()` style used in the cleaning cell.

# Rename columns
data = data.rename(columns={'old_name': 'new_name'})

# Create a new column
data['new_column'] = data['column1'] + data['column2']

# Drop columns
data = data.drop(columns=['column_name'])

# Drop rows with missing values
data = data.dropna()

# Parse a column as datetime; errors='coerce' converts unparseable values to
# NaT instead of raising an error
data['actual_delivery_time'] = pd.to_datetime(data['actual_delivery_time'], errors='coerce')

# Fill missing values with a specific value
data = data.fillna(value='default_value')

# Apply a function to a column
data['column_name'] = data['column_name'].apply(lambda x: x + 1)

# Change the type of a column
data['column_name'] = data['column_name'].astype('int')

# Filter rows based on multiple conditions
data[(data['column1'] > value1) & (data['column2'] < value2)]


In [None]:
# Sorting examples. Results are reassigned rather than sorted with
# inplace=True, matching the `data = data.drop_duplicates()` style used in the
# cleaning cell.

# Sort data by one or more columns
data = data.sort_values(by='column_name', ascending=True)

# Sort data by multiple columns (column1 ascending, then column2 descending)
data = data.sort_values(by=['column1', 'column2'], ascending=[True, False])


In [None]:
# Sum the values of every column within each group
data_grouped = data.groupby(by='column_name').sum()

# Per-group mean of a single column
data_grouped = data.groupby(by='column_name')['another_column'].mean()

# Several statistics at once (sum of column1, mean of column2), then move the
# group key back into a regular column so the frame has the default integer index
data_grouped = (
    data.groupby(by='column_name')
    .agg({'column1': 'sum', 'column2': 'mean'})
    .reset_index()
)


In [None]:
# Inner join: keep only keys present in both frames
merged_data = data1.merge(data2, on='common_column', how='inner')

# Left join: keep every row of the left frame
merged_data = data1.merge(data2, on='common_column', how='left')

# Outer join: keep every row of both frames
merged_data = data1.merge(data2, on='common_column', how='outer')


In [None]:
# Pivot: spread the values of column2 into columns, indexed by column1 and
# filled from column3 (long -> wide)
pivot_data = pd.pivot(data, index='column1', columns='column2', values='column3')

# Melt: stack column2 and column3 into variable/value pairs keyed by column1
# (wide -> long)
melted_data = data.melt(id_vars=['column1'], value_vars=['column2', 'column3'])


In [None]:
# Remove duplicate rows (reassigned rather than inplace=True, matching the
# `data = data.drop_duplicates()` call in the cleaning cell)
data = data.drop_duplicates()

# Find duplicate rows based on a subset of columns
# (duplicated() marks every repeat after the first occurrence)
data[data.duplicated(subset=['column1', 'column2'])]


In [None]:
# Convert a column to datetime format
data['date_column'] = pd.to_datetime(data['date_column'])

# Extract the year, month, day, hour from a datetime column
data['year'] = data['date_column'].dt.year
data['month'] = data['date_column'].dt.month
data['day'] = data['date_column'].dt.day
data['hour'] = data['date_column'].dt.hour

# Filter rows by date range (all of 2021).
# The upper bound is exclusive on the following day: a date string compares as
# midnight, so `<= '2021-12-31'` would drop every row later on Dec 31.
data_filtered = data[(data['date_column'] >= '2021-01-01') & (data['date_column'] < '2022-01-01')]


In [None]:
# Save the DataFrame to a CSV file
data.to_csv('output_file.csv', index=False)

# Save the DataFrame to an Excel file
data.to_excel('output_file.xlsx', index=False)

# Save the DataFrame to a SQL database.
# closing() guarantees the connection is released even if the write fails;
# a sqlite3 connection used directly as a context manager would not close it.
import sqlite3
from contextlib import closing

with closing(sqlite3.connect('your_database.db')) as conn:
    # if_exists='replace' drops and recreates the table when it already exists
    data.to_sql('your_table', conn, if_exists='replace', index=False)
    conn.commit()  # make sure the write is flushed before the connection closes


In [None]:
def _double(x):
    """Return twice the given value."""
    return x * 2


def _sum_column1_column2(row):
    """Return row['column1'] + row['column2'] for a single DataFrame row."""
    return row['column1'] + row['column2']


# Element-wise: run the function on every value of one column
data['new_column'] = data['column_name'].apply(_double)

# Row-wise: axis=1 passes each row to the function
data['row_sum'] = data.apply(_sum_column1_column2, axis=1)
