In [126]:
#Importing all the required libraries
import pandas as pd #It makes working with structured data (like CSV files or database tables) easier. You can filter, transform, and analyze data easily using pandas.
import numpy as np #mathematical library #It helps with working on large sets of numbers efficiently, performing mathematical operations, and handling multi-dimensional arrays (like tables of numbers). 
import sklearn # It provides machine learning tools and algorithms for tasks like classification, regression, clustering, and more. It helps you build and train machine learning models.
import matplotlib.pyplot as plt #It helps create visualizations such as line plots, scatter plots, and histograms. You can use it to represent your data visually.
import seaborn as sns #It builds on top of matplotlib and makes it simpler to create statistical visualizations with better aesthetics. It's useful for creating attractive plots like heatmaps, violin plots, and more.
import warnings as wr #Ignores the warnings
wr.filterwarnings('ignore') 

In [127]:
df=pd.read_csv("supershops.csv")

In [128]:
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [129]:
df2=df.copy()
df3=df.copy()
df4=df.copy()
df5=df.copy()
df6=df.copy()
df7=df.copy()
df8=df.copy()
df9=df.copy()
df10=df.copy()

Normalization

In [130]:
from sklearn.preprocessing import MinMaxScaler
m=MinMaxScaler() #default feature range=(0,1) #Can be customized according to needs. EX: (2,5)

In [131]:
#If I want to apply on 'Marketing Spend', then first fit and then transform. They can be performed individually or together.

#fit
df_ms=m.fit(df[['Marketing Spend']]) #Calculate everything
df_ms

In [132]:
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [133]:
#transfrom and replace

df[['Marketing Spend New']] = m.transform(df[['Marketing Spend']])

In [134]:
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit,Marketing Spend New
0,114523.61,136897.8,471784.1,Dhaka,192261.83,0.692617
1,162597.7,151377.59,443898.53,Ctg,191792.06,0.983359
2,153441.51,101145.55,407934.54,Rangpur,191050.39,0.927985
3,144372.41,118671.85,383199.62,Dhaka,182901.99,0.873136
4,142107.34,91391.77,366168.42,Rangpur,166187.94,0.859438


In [135]:
#fit and transform together

df[['Administration']] = m.fit_transform(df[['Administration']])

In [136]:
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit,Marketing Spend New
0,114523.61,0.651744,471784.1,Dhaka,192261.83,0.692617
1,162597.7,0.761972,443898.53,Ctg,191792.06,0.983359
2,153441.51,0.379579,407934.54,Rangpur,191050.39,0.927985
3,144372.41,0.512998,383199.62,Dhaka,182901.99,0.873136
4,142107.34,0.305328,366168.42,Rangpur,166187.94,0.859438


Normalization using Loop and Conditions

In [137]:
# Assuming the data is stored in a DataFrame called 'df2'
# Extract the 'Profit' column as the target variable
target_variable = df2['Profit']

# Separate the features from the target variable
features = df2.drop('Profit', axis=1) #df2.drop('Profit', axis=1) means that we are dropping the 'Profit' column from the DataFrame 'df2', and axis=1 specifies that we want to drop the column along the columns (axis=1) and not the rows (axis=0). The remaining columns after dropping 'Profit' will be used as features for further processing.

# Initialize an empty DataFrame for normalized data
normalized_data = pd.DataFrame()

# Iterate over each column in the features DataFrame
#check if the given column is numeric or not 
# a loop is used to iterate through the columns. For each column, it checks if the data type is numeric using pd.api.types.is_numeric_dtype. If it is numeric, it applies MinMaxScaler to the numeric columns, normalizes the values, and prints the updated DataFrame. If none of the columns are numeric, it prints the message "is not numeric."
for column in features.columns: #loop that iterates over each column in the DataFrame features.
    
    # Check if the column is numeric
    if pd.to_numeric(features[column], errors='coerce').notnull().any() or features[column].eq(0).any(): #the condition is checking whether a column is numeric or contains missing values (NaN), or if it contains zeros.
    #pd.to_numeric(features[column], errors='coerce'): This converts the values in the column to numeric format. The errors='coerce' argument ensures that any non-numeric values are coerced to NaN (missing values). This allows us to check for missing values later.
    #.notnull().any(): This checks if there are any non-null values in the column after converting it to numeric format. If there are non-null values, it means the column is numeric.
    #features[column].eq(0).any(): This checks if there are any values in the column that are equal to zero. If there are any zeros, it will return True.
    #when either of these conditions is True, the column is considered numeric or contains zeros or missing values (NaN), and it will be included in the normalization process.
        
        # Apply normalization using StandardScaler
        m = MinMaxScaler()
        normalized_column = m.fit_transform(features[column].values.reshape(-1, 1)) 
        #This line uses the fit_transform() method of the MinMaxScaler object (m) to perform normalization. It takes the column values, reshapes them into a 2D array with a single column using values.reshape(-1, 1), and then applies the scaling transformation. The normalized column is stored in the normalized_column variable.
        normalized_data[column] = normalized_column.flatten()
        #This line assigns the normalized column to the corresponding column in the normalized_data DataFrame. The flatten() method is used to convert the normalized column from a 2D array to a 1D array before assigning it.
    
    else:
        # Show a message that the data is not numeric
        print(f"Column '{column}' is not numeric.")
        #The f in the line print(f"Column '{column}' is not numeric.") is used to format the string with variables.
        #By using an f-string, I can directly embed variables or expressions within curly braces {} inside the string. The value of the variable or expression inside the curly braces is evaluated and inserted into the string.

# Concatenate the normalized features and target variable
normalized_data['Profit'] = target_variable # assigns the values of the target_variable to the 'Profit' column of the normalized_data DataFrame.

# Print the normalized data
print(normalized_data)


Column 'Area' is not numeric.
    Marketing Spend  Administration  Transport     Profit
0          0.692617        0.651744   1.000000  192261.83
1          0.983359        0.761972   0.940893  191792.06
2          0.927985        0.379579   0.864664  191050.39
3          0.873136        0.512998   0.812235  182901.99
4          0.859438        0.305328   0.776136  166187.94
5          0.797566        0.369448   0.769126  156991.12
6          0.814128        0.730161   0.270710  156122.51
7          0.788018        0.717457   0.686493  155752.60
8          0.729018        0.741733   0.660500  152211.77
9          0.745906        0.436929   0.646443  149759.96
10         0.616351        0.451506   0.485733  146121.95
11         0.608845        0.308364   0.529362  144259.40
12         0.567670        0.578836   0.529563  141585.52
13         0.556352        0.641066   0.535552  134307.35
14         0.725394        0.801327   0.543708  132602.65
15         1.000000        0.543030   0.55

Standardization

In [138]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()

In [139]:
#If I want to apply on 'Marketing Spend', then first fit and then transform. They can be performed individually or together.

#fit
df_ss=scaler.fit(df3[['Marketing Spend']]) #Calculate everything
df_ss

In [140]:
df3.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [141]:
#transfrom and replace

df3[['Marketing Spend New']] = scaler.transform(df3[['Marketing Spend']])

In [142]:
df3.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit,Marketing Spend New
0,114523.61,136897.8,471784.1,Dhaka,192261.83,0.897913
1,162597.7,151377.59,443898.53,Ctg,191792.06,1.95586
2,153441.51,101145.55,407934.54,Rangpur,191050.39,1.754364
3,144372.41,118671.85,383199.62,Dhaka,182901.99,1.554784
4,142107.34,91391.77,366168.42,Rangpur,166187.94,1.504937


In [143]:
#fit and transform together

df3[['Administration']] = scaler.fit_transform(df3[['Administration']])

In [144]:
df3.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit,Marketing Spend New
0,114523.61,0.560753,471784.1,Dhaka,192261.83,0.897913
1,162597.7,1.082807,443898.53,Ctg,191792.06,1.95586
2,153441.51,-0.728257,407934.54,Rangpur,191050.39,1.754364
3,144372.41,-0.096365,383199.62,Dhaka,182901.99,1.554784
4,142107.34,-1.079919,366168.42,Rangpur,166187.94,1.504937


Standardization using Loop and Conditions

In [145]:
target_variable = df4['Profit']

features = df4.drop('Profit', axis=1) 

normalized_data = pd.DataFrame()

for column in features.columns: #loop that iterates over each column in the DataFrame features.
    if pd.to_numeric(features[column], errors='coerce').notnull().any() or features[column].eq(0).any():
        
        # Apply normalization using StandardScaler
        scaler = StandardScaler()
        normalized_column = m.fit_transform(features[column].values.reshape(-1, 1)) 
        normalized_data[column] = normalized_column.flatten()
    
    else:
        # Show a message that the data is not numeric
        print(f"Column '{column}' is not numeric.")
      
# Concatenate the normalized features and target variable
normalized_data['Profit'] = target_variable # assigns the values of the target_variable to the 'Profit' column of the normalized_data DataFrame.

# Print the normalized data
print(normalized_data)


Column 'Area' is not numeric.
    Marketing Spend  Administration  Transport     Profit
0          0.692617        0.651744   1.000000  192261.83
1          0.983359        0.761972   0.940893  191792.06
2          0.927985        0.379579   0.864664  191050.39
3          0.873136        0.512998   0.812235  182901.99
4          0.859438        0.305328   0.776136  166187.94
5          0.797566        0.369448   0.769126  156991.12
6          0.814128        0.730161   0.270710  156122.51
7          0.788018        0.717457   0.686493  155752.60
8          0.729018        0.741733   0.660500  152211.77
9          0.745906        0.436929   0.646443  149759.96
10         0.616351        0.451506   0.485733  146121.95
11         0.608845        0.308364   0.529362  144259.40
12         0.567670        0.578836   0.529563  141585.52
13         0.556352        0.641066   0.535552  134307.35
14         0.725394        0.801327   0.543708  132602.65
15         1.000000        0.543030   0.55

Log Transformation

In [146]:
from sklearn.preprocessing import FunctionTransformer
ft = FunctionTransformer(np.log1p) #np.log1p will avoid 0

In [147]:
#If I want to apply on 'Marketing Spend', then first fit and then transform. They can be performed individually or together.

#fit
df_lt=ft.fit(df5[['Marketing Spend']]) #Calculate everything
df_lt

In [148]:
df5.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [149]:
#transfrom and replace

df5[['Marketing Spend New']] = ft.transform(df5[['Marketing Spend']])

In [150]:
df5.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit,Marketing Spend New
0,114523.61,136897.8,471784.1,Dhaka,192261.83,11.648545
1,162597.7,151377.59,443898.53,Ctg,191792.06,11.99904
2,153441.51,101145.55,407934.54,Rangpur,191050.39,11.941081
3,144372.41,118671.85,383199.62,Dhaka,182901.99,11.880158
4,142107.34,91391.77,366168.42,Rangpur,166187.94,11.864345


In [151]:
#fit and transform together

df5[['Administration']] = ft.fit_transform(df5[['Administration']])

In [152]:
df5.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit,Marketing Spend New
0,114523.61,11.826997,471784.1,Dhaka,192261.83,11.648545
1,162597.7,11.927539,443898.53,Ctg,191792.06,11.99904
2,153441.51,11.524326,407934.54,Rangpur,191050.39,11.941081
3,144372.41,11.684126,383199.62,Dhaka,182901.99,11.880158
4,142107.34,11.422922,366168.42,Rangpur,166187.94,11.864345


Log Transformation using Loop and Conditions

In [153]:
target_variable = df6['Profit']

features = df6.drop('Profit', axis=1) 

normalized_data = pd.DataFrame()

for column in features.columns: #loop that iterates over each column in the DataFrame features.
    if pd.to_numeric(features[column], errors='coerce').notnull().any() or features[column].eq(0).any():
        
        # Apply normalization using StandardScaler
        ft = FunctionTransformer(np.log1p)
        normalized_column = ft.fit_transform(features[column].values.reshape(-1, 1)) 
        normalized_data[column] = normalized_column.flatten()
    
    else:
        # Show a message that the data is not numeric
        print(f"Column '{column}' is not numeric.")
      
# Concatenate the normalized features and target variable
normalized_data['Profit'] = target_variable # assigns the values of the target_variable to the 'Profit' column of the normalized_data DataFrame.

# Print the normalized data
print(normalized_data)


Column 'Area' is not numeric.
    Marketing Spend  Administration  Transport     Profit
0         11.648545       11.826997  13.064279  192261.83
1         11.999040       11.927539  13.003354  191792.06
2         11.941081       11.524326  12.918864  191050.39
3         11.880158       11.684126  12.856314  182901.99
4         11.864345       11.422922  12.810851  166187.94
5         11.789632       11.511081  12.801779  156991.12
6         11.810185       11.899547  11.757579  156122.51
7         11.777588       11.888145  12.688121  155752.60
8         11.699766       11.909820  12.649521  152211.77
9         11.722667       11.596165  12.628010  149759.96
10        11.531885       11.613631  12.342184  146121.95
11        11.519633       11.427276  12.428198  144259.40
12        11.449610       11.754470  12.428578  141585.52
13        11.429472       11.816698  12.439823  134307.35
14        11.694782       11.961121  12.454938  132602.65
15        12.015821       11.716828  12.47

Robust Scaler

In [154]:
from sklearn.preprocessing import RobustScaler
RoSc = RobustScaler() #robust to outliers

In [155]:
#If I want to apply on 'Marketing Spend', then first fit and then transform. They can be performed individually or together.

#fit
df_rs=RoSc.fit(df7[['Marketing Spend']]) #Calculate everything
df_rs

In [156]:
df7.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [157]:
#transfrom and replace

df7[['Marketing Spend New']] = RoSc.transform(df7[['Marketing Spend']])

In [158]:
df7.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit,Marketing Spend New
0,114523.61,136897.8,471784.1,Dhaka,192261.83,0.67253
1,162597.7,151377.59,443898.53,Ctg,191792.06,1.452113
2,153441.51,101145.55,407934.54,Rangpur,191050.39,1.303634
3,144372.41,118671.85,383199.62,Dhaka,182901.99,1.156567
4,142107.34,91391.77,366168.42,Rangpur,166187.94,1.119836


In [159]:
#fit and transform together

df7[['Administration']] = RoSc.fit_transform(df7[['Administration']])

In [160]:
df7.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit,Marketing Spend New
0,114523.61,0.345355,471784.1,Dhaka,192261.83,0.67253
1,162597.7,0.697565,443898.53,Ctg,191792.06,1.452113
2,153441.51,-0.52429,407934.54,Rangpur,191050.39,1.303634
3,144372.41,-0.097977,383199.62,Dhaka,182901.99,1.156567
4,142107.34,-0.761543,366168.42,Rangpur,166187.94,1.119836


Robust Scaler using Loop and Conditions

In [161]:
target_variable = df8['Profit']

features = df8.drop('Profit', axis=1) 

normalized_data = pd.DataFrame()

for column in features.columns: #loop that iterates over each column in the DataFrame features.
    if pd.to_numeric(features[column], errors='coerce').notnull().any() or features[column].eq(0).any():
        
        # Apply normalization using StandardScaler
        RoSc = RobustScaler()
        normalized_column = RoSc.fit_transform(features[column].values.reshape(-1, 1)) 
        normalized_data[column] = normalized_column.flatten()
    
    else:
        # Show a message that the data is not numeric
        print(f"Column '{column}' is not numeric.")
      
# Concatenate the normalized features and target variable
normalized_data['Profit'] = target_variable # assigns the values of the target_variable to the 'Profit' column of the normalized_data DataFrame.

# Print the normalized data
print(normalized_data)


Column 'Area' is not numeric.
    Marketing Spend  Administration  Transport     Profit
0          0.672530        0.345355   1.552016  192261.83
1          1.452113        0.697565   1.383714  191792.06
2          1.303634       -0.524290   1.166654  191050.39
3          1.156567       -0.097977   1.017368  182901.99
4          1.119836       -0.761543   0.914576  166187.94
5          0.953936       -0.556662   0.894617  156991.12
6          0.998345        0.595921  -0.524591  156122.51
7          0.928334        0.555328   0.659326  155752.60
8          0.770134        0.632895   0.585311  152211.77
9          0.815416       -0.341041   0.545285  149759.96
10         0.468034       -0.294461   0.087672  146121.95
11         0.447908       -0.751841   0.211904  144259.40
12         0.337504        0.112392   0.212476  141585.52
13         0.307158        0.311235   0.229530  134307.35
14         0.760416        0.823317   0.252754  132602.65
15         1.496732       -0.002018   0.28

Max Absolute Scaler

In [162]:
from sklearn.preprocessing import MaxAbsScaler
scaler = MaxAbsScaler()

In [163]:
#If I want to apply on 'Marketing Spend', then first fit and then transform. They can be performed individually or together.

#fit
df_mas=scaler.fit(df9[['Marketing Spend']]) #Calculate everything
df_mas

In [164]:
df9.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [165]:
#transfrom and replace

df9[['Marketing Spend New']] = scaler.transform(df9[['Marketing Spend']])

In [166]:
df9.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit,Marketing Spend New
0,114523.61,136897.8,471784.1,Dhaka,192261.83,0.692617
1,162597.7,151377.59,443898.53,Ctg,191792.06,0.983359
2,153441.51,101145.55,407934.54,Rangpur,191050.39,0.927985
3,144372.41,118671.85,383199.62,Dhaka,182901.99,0.873136
4,142107.34,91391.77,366168.42,Rangpur,166187.94,0.859438


In [167]:
#fit and transform together

df9[['Administration']] = scaler.fit_transform(df9[['Administration']])

In [168]:
df9.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit,Marketing Spend New
0,114523.61,0.749527,471784.1,Dhaka,192261.83,0.692617
1,162597.7,0.828805,443898.53,Ctg,191792.06,0.983359
2,153441.51,0.553781,407934.54,Rangpur,191050.39,0.927985
3,144372.41,0.649738,383199.62,Dhaka,182901.99,0.873136
4,142107.34,0.500378,366168.42,Rangpur,166187.94,0.859438


Max Absolute Scaler using Loop and Conditions

In [169]:
target_variable = df10['Profit']

features = df10.drop('Profit', axis=1) 

normalized_data = pd.DataFrame()

for column in features.columns: #loop that iterates over each column in the DataFrame features.
    if pd.to_numeric(features[column], errors='coerce').notnull().any() or features[column].eq(0).any():
        
        # Apply normalization using StandardScaler
        scaler = MaxAbsScaler()
        normalized_column = scaler.fit_transform(features[column].values.reshape(-1, 1)) 
        normalized_data[column] = normalized_column.flatten()
    
    else:
        # Show a message that the data is not numeric
        print(f"Column '{column}' is not numeric.")
      
# Concatenate the normalized features and target variable
normalized_data['Profit'] = target_variable # assigns the values of the target_variable to the 'Profit' column of the normalized_data DataFrame.

# Print the normalized data
print(normalized_data)


Column 'Area' is not numeric.
    Marketing Spend  Administration  Transport     Profit
0          0.692617        0.749527   1.000000  192261.83
1          0.983359        0.828805   0.940893  191792.06
2          0.927985        0.553781   0.864664  191050.39
3          0.873136        0.649738   0.812235  182901.99
4          0.859438        0.500378   0.776136  166187.94
5          0.797566        0.546494   0.769126  156991.12
6          0.814128        0.805926   0.270710  156122.51
7          0.788018        0.796789   0.686493  155752.60
8          0.729018        0.814249   0.660500  152211.77
9          0.745906        0.595028   0.646443  149759.96
10         0.616351        0.605512   0.485733  146121.95
11         0.608845        0.502561   0.529362  144259.40
12         0.567670        0.697090   0.529563  141585.52
13         0.556352        0.741847   0.535552  134307.35
14         0.725394        0.857110   0.543708  132602.65
15         1.000000        0.671338   0.55