In [1]:
import pandas as pd
import numpy as np


In [2]:
# Load the preprocessed dataset from a CSV file. The path is relative, so it is
# resolved against the kernel's current working directory.
# NOTE(review): the recorded outputs further down show column '0' holding text
# such as "(0, 0)\t-0.41...", which looks like a stringified sparse matrix
# rather than usable feature columns -- confirm the step that wrote this file.
dataset = pd.read_csv('Processed_Data-1.csv')

In [3]:
# Display basic information about the dataset: a header line followed by the
# DataFrame.info() summary (index range, per-column non-null counts and dtypes,
# memory usage). info() prints directly, so no print() wrapper is needed.
print("Dataset Information:")
dataset.info()

Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   0               25000 non-null  object
 1   product_wg_ton  25000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 390.8+ KB


In [4]:
# Display the column labels (printed as a pandas Index) under a header line.
print("\nColumn Names:")
print(dataset.columns)


Column Names:
Index(['0', 'product_wg_ton'], dtype='object')


In [5]:
# Identify quantitative and qualitative columns
def identify_column_types(dataset):
    """Split the columns of ``dataset`` into quantitative and qualitative names.

    A column is quantitative when its dtype is numeric according to
    ``pd.api.types.is_numeric_dtype``; every other dtype (object, pandas string
    dtypes, category, datetime, ...) is qualitative. Testing only for the
    object dtype would route those non-numeric dtypes into the quantitative
    list, where numeric statistics cannot be computed on them.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are classified.

    Returns
    -------
    tuple[list, list]
        ``(quan, qual)`` -- the quantitative and qualitative column names,
        each in the frame's original column order.
    """
    quan = []
    qual = []
    for columnName in dataset.columns:
        if pd.api.types.is_numeric_dtype(dataset[columnName].dtype):
            quan.append(columnName)
        else:
            qual.append(columnName)
    return quan, qual

In [6]:
# Classify the columns: `quan` receives the quantitative column names and
# `qual` the qualitative ones, as returned by identify_column_types().
quan, qual = identify_column_types(dataset)


In [7]:
# Display the two lists of column names produced by identify_column_types(),
# each under its own header line.
print("\nQuantitative Columns:")
print(quan)

print("\nQualitative Columns:")
print(qual)


Quantitative Columns:
['product_wg_ton']

Qualitative Columns:
['0']


In [8]:
# Calculate descriptive statistics for quantitative columns
def calculate_descriptive_statistics(dataset, quan):
    """Build a table of descriptive statistics for the given numeric columns.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Source data.
    quan : list
        Names of the columns in ``dataset`` to summarise.

    Returns
    -------
    pandas.DataFrame
        One column per name in ``quan`` and one row per statistic: mean,
        median, mode, the 25/50/75/99/100 % quantiles, IQR, 1.5 * IQR, the
        lower ("Lesser" = Q1 - 1.5 * IQR) and upper ("Greater" = Q3 + 1.5 * IQR)
        outlier fences, min, max, kurtosis, skew, variance and standard
        deviation. Cells keep object dtype so integer results (mode, min, max)
        are not coerced to float.
    """
    stat_names = ["Mean", "Median", "Mode", "Q1:25%", "Q2:50%", "Q3:75%", "99%", "Q4:100%", "IQR", "1.5rule", "Lesser", "Greater", "Min", "Max", "kurtosis", "skew", "Var", "Std"]
    descriptive = pd.DataFrame(index=stat_names, columns=quan)
    for columnName in quan:
        values = dataset[columnName]
        # One quantile call instead of re-running dataset.describe() for every
        # statistic. Series.quantile skips NaN, so the 99th percentile stays
        # consistent with the other quantiles when values are missing.
        q1, q2, q3, p99, q4 = values.quantile([0.25, 0.5, 0.75, 0.99, 1.0])
        iqr = q3 - q1
        fence = 1.5 * iqr
        # mode() is empty when the column has no non-null values.
        modes = values.mode()
        stats = {
            "Mean": values.mean(),
            "Median": values.median(),
            "Mode": modes.iloc[0] if len(modes) else np.nan,
            "Q1:25%": q1,
            "Q2:50%": q2,
            "Q3:75%": q3,
            "99%": p99,
            "Q4:100%": q4,
            "IQR": iqr,
            "1.5rule": fence,
            "Lesser": q1 - fence,
            "Greater": q3 + fence,
            "Min": values.min(),
            "Max": values.max(),
            "kurtosis": values.kurtosis(),
            "skew": values.skew(),
            "Var": values.var(),
            "Std": values.std(),
        }
        # Assign the whole column at once: chained indexing such as
        # descriptive[col][row] = value writes to a temporary object under
        # pandas Copy-on-Write and leaves the frame unchanged.
        descriptive[columnName] = pd.Series(stats, dtype=object)
    return descriptive

In [10]:
# Calculate descriptive statistics: one column per name in `quan`, one row per
# statistic, as laid out by calculate_descriptive_statistics().
descriptive_stats = calculate_descriptive_statistics(dataset, quan)


In [11]:
# Display the descriptive statistics table under a header line.
print("\nDescriptive Statistics:")
print(descriptive_stats)


Descriptive Statistics:
            product_wg_ton
Mean           22102.63292
Median             22101.0
Mode                  5146
Q1:25%             13059.0
Q2:50%             22101.0
Q3:75%             30103.0
99%               51075.02
Q4:100%            55151.0
IQR                17044.0
1.5rule            25566.0
Lesser            -12507.0
Greater            55669.0
Min                   2065
Max                  55151
kurtosis         -0.502022
skew              0.331631
Var       134739977.918249
Std           11607.755077


In [12]:
# Create a frequency table for a specified column
def create_frequency_table(columnName, dataset):
    """Return the frequency table of ``dataset[columnName]``.

    The result has one row per distinct value, in ``value_counts()`` order
    (most frequent first), with four columns: the value itself
    ("Unique_Values"), its count ("Frequency"), the count divided by the total
    number of rows in ``dataset`` ("Relative Frequency"), and the running sum
    of that share ("Cusum").
    """
    counts = dataset[columnName].value_counts()
    table = pd.DataFrame(columns=["Unique_Values", "Frequency", "Relative Frequency", "Cusum"])
    table["Unique_Values"] = counts.index
    table["Frequency"] = counts.values
    # The denominator is the full row count of the frame.
    share = table["Frequency"] / len(dataset)
    table["Relative Frequency"] = share
    table["Cusum"] = share.cumsum()
    return table

In [13]:
# Example frequency table for the first qualitative column (skipped when there
# are no qualitative columns).
# NOTE(review): in the recorded output the table is indexed 0..24999, i.e. it
# has as many rows as the dataset, and the visible rows all have frequency 1,
# so for this column the table carries no grouping information.
if qual:
    example_freq_table = create_frequency_table(qual[0], dataset)
    print(f"\nFrequency Table for '{qual[0]}':")
    print(example_freq_table)



Frequency Table for '0':
                                           Unique_Values  Frequency  \
0        (0, 0)\t-0.4178074753217756\n  (0, 1)\t0.188...          1   
1        (0, 0)\t-0.4178074753217756\n  (0, 1)\t0.188...          1   
2        (0, 0)\t-0.034159973557124454\n  (0, 1)\t-0....          1   
3        (0, 0)\t1.50043003350148\n  (0, 1)\t-0.64504...          1   
4        (0, 0)\t0.7331350299721778\n  (0, 1)\t-0.645...          1   
...                                                  ...        ...   
24995    (0, 0)\t-1.568749980615729\n  (0, 1)\t0.1886...          1   
24996    (0, 0)\t1.116782531736829\n  (0, 1)\t-0.6450...          1   
24997    (0, 0)\t-1.568749980615729\n  (0, 1)\t-0.645...          1   
24998    (0, 0)\t-1.1851024788510778\n  (0, 1)\t-0.64...          1   
24999    (0, 0)\t1.50043003350148\n  (0, 1)\t1.022423...          1   

       Relative Frequency    Cusum  
0                 0.00004  0.00004  
1                 0.00004  0.00008  
2         

In [14]:
# Example skewness and kurtosis for the first quantitative column (skipped
# when there are no quantitative columns). These are the same Series.skew() /
# Series.kurtosis() calls that calculate_descriptive_statistics() stores in the
# "skew" and "kurtosis" rows, printed here at full precision.
if quan:
    example_column = quan[0]
    skewness = dataset[example_column].skew()
    kurtosis = dataset[example_column].kurtosis()
    print(f"\nSkewness of '{example_column}': {skewness}")
    print(f"Kurtosis of '{example_column}': {kurtosis}")


Skewness of 'product_wg_ton': 0.33163104430999646
Kurtosis of 'product_wg_ton': -0.5020221997549963
