<a href="https://colab.research.google.com/github/Ruhan-Saad-Dave/My-Projects/blob/main/MyTemplate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Data preprocessing

##Importing necessary libraries

In [None]:
#Numeric / tabular data
import numpy as np
import pandas as pd
#Plotting
import matplotlib.pyplot as plt
import seaborn as sns
#scikit-learn helpers (the scaler class is spelled StandardScaler)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

##Importing dataset

In [None]:
#Read the dataset from a variety of storage formats (file names are placeholders)
df_csv = pd.read_csv('Data.csv')
df_excel = pd.read_excel('Data.xlsx')
df_json = pd.read_json('Data.json')
df_txt = pd.read_csv('Data.txt', sep = '\t')    #Tab-separated text file
df_hdf = pd.read_hdf('Data.h5', key = 'data')    #HDF5 (Hierarchical Data Format version 5)
df_xml = pd.read_xml('Data.xml')    #XML (eXtensible Markup Language), available since pandas 1.3
df_parquet = pd.read_parquet('Data.parquet')

#SQLite Database
import sqlite3
conn = sqlite3.connect('database.db')
try:
    df_sql = pd.read_sql_query('SELECT * FROM table_name', conn)
finally:
    conn.close()    #Release the database file even if the query fails

#BigQuery: use the separate pandas-gbq package (pandas_gbq.read_gbq)

##Data Information

In [None]:
#Quick inspection of a DataFrame named df; df is not defined in this section
#NOTE(review): presumably df is one of the frames loaded above (e.g. df = df_csv) - confirm
#In a notebook cell only the last bare expression is rendered; wrap the others in print()/display() to see them
df.head()    #First rows
df.tail()    #Last rows
df.info()    #Prints column dtypes and non-null counts
df.describe()    #Summary statistics
df.describe(include = 'object')    #Summary restricted to object (string) columns
print(df.shape)    #Rows and Columns count
print(df.columns)    #Column labels
print(df.index)    #Row labels
print(df.dtypes)    #Data type of each column
df['column_name'].value_counts()    #Occurrences of each unique value in the column
df.isnull().sum()    #Number of null values in each column

##Check for Null value

In [None]:
df.isnull()    #Boolean mask: True where a value is null
df.notnull()    #Boolean mask: True where a value is not null
df.isna()    #Alias of isnull()
df.notna()    #Alias of notnull()
df.isnull().sum()    #Sum of null values in each column
df.isnull().any()    #Per column: check if any null values exist
df.isnull().all()    #Per column: check if all values are null
df.isnull().sum().value_counts()    #How many columns share each null count (not the per-column count itself)

##Replace Null values

In [None]:
#Each call returns a new object and leaves df unchanged; assign the result to keep it
#df and value are placeholders that must be defined before running this cell
df.fillna(value)    #Scalar, dictionary, Series or DataFrame
df.replace(to_replace = np.nan, value = value)    #Replace specific values (both arguments passed by keyword)
df.interpolate(method = 'linear')    #method = (linear, quadratic, nearest)
df.bfill()    #Backward fill: replace null with next non null
df.ffill()    #Forward fill: replace null with previous non null
df.dropna(axis = 0)    #Drop rows with null
df.dropna(axis = 1)    #Drop columns with null

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy = 'mean')    #also 'median', 'most_frequent' (mode) or 'constant'
#fit_transform returns a NumPy array, so store it under its own name and keep df a DataFrame
imputed_array = imputer.fit_transform(df)
#or
#Rebuild a DataFrame, restoring the original column names and row index
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns = df.columns, index = df.index)

#Column-wise alternative without scikit-learn; returns a new Series, assign it to keep the result
df['column_name'].fillna(df['column_name'].mean())    #or .median() or .mode()[0]

def custom_replace(value):
    """Return ``custom_value`` when ``value`` is null, otherwise ``value`` itself.

    ``custom_value`` is not a parameter: it is looked up in the surrounding
    (module/notebook) scope at call time, so it must be defined before a null
    value is encountered. Nullness is decided by ``pd.isnull``.
    """
    return custom_value if pd.isnull(value) else value
#Run custom_replace on every element of the column; the resulting Series is returned, not stored
df['column_name'].apply(custom_replace)

##Graph representation

In [None]:
#matplotlib
#data, categories, values, x, y, y1 and y2 are placeholders not defined in this section
#NOTE(review): without plt.figure()/plt.show() between them these calls draw onto the same current axes - pick one per figure
plt.hist(data, bins = 10)    #Histogram
plt.bar(categories, values)    #Bar plot
plt.plot(x, y)    #Line plot
plt.scatter(x,y)    #Scatter plot
plt.boxplot(data)    #Box plot
plt.pie(values, labels = categories)    #Pie charts
plt.fill_between(x,y1,y2)    #Area plot: fills the region between y1 and y2

#seaborn
#'category', 'value', 'x' and 'y' are placeholder column names of df
sns.heatmap(data)    #Heatmaps
sns.pairplot(df)    #Pair plot
sns.violinplot(x = 'category', y = 'value', data = df)    #Combining box plots and kernel density plot
sns.jointplot(x = 'x', y = 'y', data = df)    #Joint plot: two-variable plot with marginal distributions

#example: histogram with a KDE curve for one column of df_imputed
plt.figure(figsize=(8, 6))
sns.histplot(
    df_imputed['numerical_columns'],
    bins=20,
    kde=True,
    color='skyblue',
)
#Labels and title are independent settings on the current axes
plt.xlabel('X-axis')
plt.ylabel('Y-axis')
plt.title('Title')
plt.show()

##Encoding Labels

In [None]:
#The techniques below are alternatives: encoder and encoded_data are re-bound by each one
#data and labels are placeholders not defined in this section

#Ordinal Encoding: each category becomes an integer code
from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder()
encoded_data = encoder.fit_transform(data)

#One-Hot Encoding: one indicator column per category
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
encoded_data = encoder.fit_transform(data)

#Label Encoding: integer codes for a 1-D array of target labels
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(labels)

#Frequency Encoding: replace each category with the number of times it occurs
#Overwrites data['column_name'] in place
frequency_map = data['column_name'].value_counts().to_dict()
data['column_name'] = data['column_name'].map(frequency_map)

#Target Encoding: replace each category with the mean of target_variable for that category
#Overwrites data['column_name'] in place; if run after the frequency encoding above it groups by the counts
mean_map = data.groupby('column_name')['target_variable'].mean().to_dict()
data['column_name'] = data['column_name'].map(mean_map)

#Binary Encoding (category_encoders is a separate third-party package)
import category_encoders as ce
encoder = ce.BinaryEncoder(cols=['column_name'])
encoded_data = encoder.fit_transform(data)

#Hashing Encoding
import category_encoders as ce
encoder = ce.HashingEncoder(cols=['column_name'])
encoded_data = encoder.fit_transform(data)

#Dummy Encoding: indicator columns with the first level of each category dropped
#The result is returned, not stored; assign it to keep it
pd.get_dummies(data, drop_first=True)

##Splitting Training and Test Set

In [None]:
#X, y and k are placeholders not defined in this section
#NOTE(review): X[train_index] / y[train_index] index by position as on NumPy arrays;
#if X and y are pandas objects this presumably needs .iloc[...] - confirm
#Every loop below overwrites X_train, X_test, y_train, y_test on each iteration,
#so the model has to be trained and evaluated inside the loop body

#Holdout method
#test_size=0.2 holds out 20% of the samples; random_state=42 makes the split reproducible
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#K-Fold Cross-Validation: k folds, each used once as the test set
from sklearn.model_selection import KFold
kf = KFold(n_splits=k)
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

#Stratified K-Fold Cross-Validation: split() also takes y so folds can follow the class labels
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=k)
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

#Leave-One-Out (LOO) Cross-Validation: each sample is the test set exactly once
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()
for train_index, test_index in loo.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

#Time Series Split: test indices always come after the training indices
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=k)
for train_index, test_index in tscv.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

#Custom Splitting
# Example of custom splitting logic
# The ... (Ellipsis) values are placeholders to be replaced with real index arrays
train_indices = ...
test_indices = ...
X_train, X_test = X[train_indices], X[test_indices]
y_train, y_test = y[train_indices], y[test_indices]

##Feature Scaling

In [None]:
#The techniques below are alternatives: scaler and scaled_data are re-bound by each one
#data is a placeholder not defined in this section; every scaler here is fitted on all of it

#Standardization (Z-score normalization): zero mean and unit variance per feature
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

#Min-Max Scaling(Normalization): rescale each feature to a fixed range, [0, 1] by default
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(data)

#Robust Scaling: centre on the median and scale by the interquartile range
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
scaled_data = scaler.fit_transform(data)

#Max Abs Scaler: divide each feature by its maximum absolute value
from sklearn.preprocessing import MaxAbsScaler
scaler = MaxAbsScaler()
scaled_data = scaler.fit_transform(data)

#Power Transformation: make the data more Gaussian-like
from sklearn.preprocessing import PowerTransformer
scaler = PowerTransformer(method='yeo-johnson')
scaled_data = scaler.fit_transform(data)

#Quantile Transformation: map each feature onto a uniform distribution
from sklearn.preprocessing import QuantileTransformer
scaler = QuantileTransformer(output_distribution='uniform')
scaled_data = scaler.fit_transform(data)

#Unit Vector Scaling: rescale each sample (row) to unit norm
from sklearn.preprocessing import Normalizer
scaler = Normalizer()
scaled_data = scaler.fit_transform(data)

#Log Transformation: log1p computes log(1 + x) element-wise
import numpy as np
scaled_data = np.log1p(data)

#Notes

##Numpy

##Pandas

##Matplotlib

##Seaborn

##Scikit-learn

In [None]:
from sklearn.