<a href="https://colab.research.google.com/github/Shantnu-Talokar/ML_Lab_ShantnuTalokar/blob/main/ML_Practical2%26BonusAssignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

def import_sales_data(file_path):
    """Load a sales CSV into a DataFrame.

    The file is decoded as ISO-8859-1. A status line is printed for both
    the success and the failure case.

    Args:
        file_path: Location of the CSV file to read.

    Returns:
        The loaded DataFrame, or ``None`` when the file does not exist or
        any other error occurs while reading it.
    """
    sales_frame = None
    try:
        sales_frame = pd.read_csv(file_path, encoding='ISO-8859-1')
    except FileNotFoundError:
        print(f"The file at {file_path} was not found.")
    except Exception as read_error:
        # Any other read/parse failure is reported, not propagated.
        print(f"An error occurred: {read_error}")
    else:
        print(f"Data imported successfully from {file_path}")
    return sales_frame

def clean_and_modify_data(df):
    """Clean a sales DataFrame in place and add a discounted-sales column.

    Steps, in order:

    1. Missing values in ``Sales``, ``Quantity``, ``Discount`` and
       ``Profit`` are replaced with 0 (columns that are absent are ignored).
    2. When ``Sales``, ``Quantity`` and ``Discount`` are all present,
       ``Total_Sales_After_Discount`` is added as ``Sales * (1 - Discount)``.
    3. ``Order Date`` (when present) is parsed to datetimes; values that
       cannot be parsed become ``NaT``.
    4. Rows with a missing ``Order ID`` or ``Customer ID`` are dropped;
       only the identifier columns that actually exist are checked.

    A preview of the first rows is printed before and after cleaning.

    Args:
        df: Sales data. The frame is modified in place.

    Returns:
        The same DataFrame object that was passed in, after cleaning.
    """
    print("Data preview before cleaning:")
    print(df.head())

    # Fill the numeric gaps *before* deriving the total; otherwise a missing
    # Sales/Discount value would turn into NaN in the derived column even
    # though the source columns end up zero-filled.
    df.fillna({'Sales': 0, 'Quantity': 0, 'Discount': 0, 'Profit': 0}, inplace=True)

    if {'Sales', 'Quantity', 'Discount'}.issubset(df.columns):
        df['Total_Sales_After_Discount'] = df['Sales'] * (1 - df['Discount'])
        print("Total Sales After Discount column added successfully.")
    else:
        print("Required columns 'Sales', 'Quantity', or 'Discount' are missing.")

    if 'Order Date' in df.columns:
        df['Order Date'] = pd.to_datetime(df['Order Date'], errors='coerce')

    # dropna(subset=...) raises KeyError for unknown labels, so restrict the
    # check to the identifier columns this frame really has.
    id_columns = [col for col in ('Order ID', 'Customer ID') if col in df.columns]
    if id_columns:
        df.dropna(subset=id_columns, inplace=True)

    print("Data after cleaning and modification:")
    print(df.head())
    return df

def export_sales_data(df, output_file_path):
    """Write the cleaned sales data to a CSV file without the index column.

    Failures are reported on stdout instead of being raised.

    Args:
        df: DataFrame to write.
        output_file_path: Destination path of the CSV file.
    """
    try:
        df.to_csv(output_file_path, index=False)
    except Exception as export_error:
        print(f"An error occurred during export: {export_error}")
        return
    print(f"Cleaned data exported successfully to {output_file_path}")

def main():
    """Run the sales pipeline: import, clean, then export.

    Reads ``SampleSuperstore.csv`` and writes ``cleaned_sales_data.csv``.
    Nothing is cleaned or exported when the import step returns ``None``.
    """
    source_csv = 'SampleSuperstore.csv'
    target_csv = 'cleaned_sales_data.csv'

    raw_sales = import_sales_data(source_csv)
    if raw_sales is None:
        return

    export_sales_data(clean_and_modify_data(raw_sales), target_csv)


if __name__ == "__main__":
    main()


Data imported successfully from SampleSuperstore.csv
Data preview before cleaning:
   Row ID        Order ID  Order Date   Ship Date       Ship Mode Customer ID  \
0       1  CA-2016-152156   11/8/2016  11/11/2016    Second Class    CG-12520   
1       2  CA-2016-152156   11/8/2016  11/11/2016    Second Class    CG-12520   
2       3  CA-2016-138688   6/12/2016   6/16/2016    Second Class    DV-13045   
3       4  US-2015-108966  10/11/2015  10/18/2015  Standard Class    SO-20335   
4       5  US-2015-108966  10/11/2015  10/18/2015  Standard Class    SO-20335   

     Customer Name    Segment        Country             City  ...  \
0      Claire Gute   Consumer  United States        Henderson  ...   
1      Claire Gute   Consumer  United States        Henderson  ...   
2  Darrin Van Huff  Corporate  United States      Los Angeles  ...   
3   Sean O'Donnell   Consumer  United States  Fort Lauderdale  ...   
4   Sean O'Donnell   Consumer  United States  Fort Lauderdale  ...   

  Postal 

**Bonus Assignment**

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

def import_and_clean_data(file_path):
    """Load a Titanic CSV and return a cleaned DataFrame.

    Cleaning steps:

    * Column headers that differ only by case from ``Age``, ``Embarked``,
      ``Cabin``, ``Survived`` and ``Fare`` are renamed to those spellings,
      so files with lowercase headers (``survived``, ``age``, ``fare``,
      ``embarked``) are handled as well.
    * Missing ``Age`` values are replaced with the column median.
    * Missing ``Embarked`` values are replaced with the most frequent value
      (left untouched when the column has no values at all).
    * The ``Cabin`` column is dropped when present.
    * Rows missing ``Survived`` or ``Fare`` are dropped.

    A preview and per-column missing-value counts are printed along the way.

    Args:
        file_path: Location of the CSV file to read.

    Returns:
        The cleaned DataFrame, or ``None`` when reading or cleaning fails
        (for example when a required column is absent); the error is
        printed in that case.
    """
    try:
        df = pd.read_csv(file_path)

        print("Initial Data Preview:")
        print(df.head())

        print("\nMissing values before cleaning:")
        print(df.isnull().sum())

        # Map case-variant headers onto the spellings used below. An exact
        # match already in the frame always wins over a case-variant one.
        by_lower = {str(col).lower(): col for col in df.columns}
        renames = {}
        for name in ('Age', 'Embarked', 'Cabin', 'Survived', 'Fare'):
            if name not in df.columns and name.lower() in by_lower:
                renames[by_lower[name.lower()]] = name
        if renames:
            df = df.rename(columns=renames)

        # Assign back instead of calling fillna(inplace=True) on a column
        # selection: the chained in-place form is deprecated and has no
        # effect on the frame under pandas Copy-on-Write.
        df['Age'] = df['Age'].fillna(df['Age'].median())

        embarked_modes = df['Embarked'].mode()
        if not embarked_modes.empty:
            df['Embarked'] = df['Embarked'].fillna(embarked_modes.iloc[0])

        df = df.drop(columns='Cabin', errors='ignore')

        df = df.dropna(subset=['Survived', 'Fare'])

        print("\nMissing values after cleaning:")
        print(df.isnull().sum())

        return df

    except Exception as e:
        print(f"An error occurred: {e}")
        return None

def perform_eda(df):
    """Print summary statistics and display exploratory plots for ``df``.

    Output, in order: ``describe()`` statistics, the correlation matrix of
    the numeric columns (printed, then drawn as an annotated heatmap),
    histograms with a KDE overlay for ``Age`` and ``Fare``, and a count
    plot of ``Survived``.

    Args:
        df: DataFrame containing at least ``Age``, ``Fare`` and ``Survived``.
    """
    print("\nBasic Statistics:")
    print(df.describe())

    print("\nCorrelation Matrix:")
    corr = df.select_dtypes(include=np.number).corr()
    print(corr)

    plt.figure(figsize=(10, 6))
    sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('Correlation Matrix')
    plt.show()

    # Both histograms share the same layout; only column and title differ.
    histograms = (
        ('Age', 'Distribution of Age'),
        ('Fare', 'Distribution of Fare'),
    )
    for column, title in histograms:
        plt.figure(figsize=(8, 6))
        sns.histplot(df[column], kde=True, bins=30)
        plt.title(title)
        plt.xlabel(column)
        plt.ylabel('Frequency')
        plt.show()

    plt.figure(figsize=(8, 6))
    sns.countplot(x='Survived', data=df)
    plt.title('Survival Count (0 = No, 1 = Yes)')
    plt.xlabel('Survived')
    plt.ylabel('Count')
    plt.show()

def export_cleaned_data(df, output_file_path):
    """Save the cleaned DataFrame as a CSV file, omitting the index.

    An export failure is printed rather than raised.

    Args:
        df: DataFrame to write.
        output_file_path: Destination path of the CSV file.
    """
    succeeded = False
    try:
        df.to_csv(output_file_path, index=False)
        succeeded = True
    except Exception as export_error:
        print(f"An error occurred during export: {export_error}")
    if succeeded:
        print(f"Cleaned data exported successfully to {output_file_path}")

def main():
    """Run the Titanic pipeline: import and clean, explore, then export.

    Reads ``titanic.csv`` and writes ``cleaned_titanic_data.csv``. The EDA
    and export steps are skipped when the import step returns ``None``.
    """
    source_csv = 'titanic.csv'
    target_csv = 'cleaned_titanic_data.csv'

    titanic = import_and_clean_data(source_csv)
    if titanic is None:
        return

    perform_eda(titanic)
    export_cleaned_data(titanic, target_csv)


if __name__ == "__main__":
    main()


Initial Data Preview:
   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  

Missing values before cleaning:
survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class       