In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

# Function to read CSV or Excel file
def read_file(file_path):
    """
    Load a CSV or Excel (.xlsx) file into a DataFrame.

    The extension check is case-insensitive, so ``data.CSV`` and ``Data.Xlsx``
    are accepted, and path-like objects are handled by converting them to
    text before inspecting the suffix.

    :param file_path: str or path-like, location of the .csv or .xlsx file
    :return: DataFrame with the file contents, or None when the file could not
             be read (the reason is printed)
    """
    try:
        # Normalise once so the suffix test ignores letter case and also
        # works for pathlib.Path objects, which have no ``endswith``.
        normalized_path = str(file_path).lower()

        if normalized_path.endswith('.csv'):
            return pd.read_csv(file_path)
        if normalized_path.endswith('.xlsx'):
            return pd.read_excel(file_path)

        raise ValueError("Unsupported file format! Please provide a .csv or .xlsx file.")

    except Exception as e:
        # Deliberately broad: callers in this notebook rely on a None return
        # (not an exception) to detect any loading failure.
        print(f"Error occurred while reading the file: {e}")
        return None


In [2]:
# Function to clean the data
def clean_data(df):
    """
    Return a cleaned copy of the DataFrame.

    Steps, in order: remove duplicate rows, fill missing numeric values with
    the mean of their column, then drop every row that still has a null.
    :param df: DataFrame, the data to be cleaned
    :return: DataFrame, cleaned data; if any step raises, the error is printed
             and the original ``df`` is returned unchanged
    """
    try:
        deduplicated = df.drop_duplicates()

        # Means are computed after de-duplication so repeated rows do not
        # weigh on the fill value.
        column_means = deduplicated.mean(numeric_only=True)

        # Non-numeric gaps are not filled, so those rows are removed here.
        return deduplicated.fillna(column_means).dropna()

    except Exception as e:
        print(f"Error occurred during cleaning: {e}")
        return df


In [3]:
# Function to calculate the mean
def calculate_mean(df, column):
    """
    Return the arithmetic mean of one column of the DataFrame.
    :param df: DataFrame holding the data
    :param column: Name of the column to average
    :return: Value produced by pandas' ``mean()`` for that column
    """
    selected = df[column]
    return selected.mean()

# Function to calculate the median
def calculate_median(df, column):
    """
    Return the median of one column of the DataFrame.
    :param df: DataFrame holding the data
    :param column: Name of the column whose median is wanted
    :return: Value produced by pandas' ``median()`` for that column
    """
    selected = df[column]
    return selected.median()

# Function to calculate the mode
def calculate_mode(df, column):
    """
    Calculates the mode (most frequent value) of a specified column in the DataFrame.

    When several values tie, the first entry of the Series returned by
    pandas' ``mode()`` is used. When the column has no mode at all (it is
    empty or contains only missing values), NaN is returned instead of
    raising ``KeyError``.
    :param df: DataFrame containing the data
    :param column: Column name to calculate the mode for
    :return: Mode value, or NaN if the column has no non-missing values
    """
    modes = df[column].mode()

    # ``mode()`` yields an empty result for an empty / all-missing column,
    # so there is no first element to pick.
    if modes.empty:
        return float("nan")

    # Positional access: take the first mode regardless of index labels.
    return modes.iloc[0]

# Function to calculate harmonic mean
def calculate_harmonic_mean(df, column):
    """
    Return the harmonic mean of one column of the DataFrame.
    :param df: DataFrame holding the data
    :param column: Name of the column to summarise
    :return: Value produced by ``scipy.stats.hmean`` for that column
    """
    selected = df[column]
    return stats.hmean(selected)

# Function to calculate geometric mean
def calculate_geometric_mean(df, column):
    """
    Return the geometric mean of one column of the DataFrame.
    :param df: DataFrame holding the data
    :param column: Name of the column to summarise
    :return: Value produced by ``scipy.stats.gmean`` for that column
    """
    selected = df[column]
    return stats.gmean(selected)

In [4]:
# Function to calculate the range
def calculate_range(df, column):
    """
    Return the spread between the largest and smallest value of a column.
    :param df: DataFrame holding the data
    :param column: Name of the column to measure
    :return: Column maximum minus column minimum
    """
    highest = df[column].max()
    lowest = df[column].min()
    return highest - lowest

# Function to calculate variance
def calculate_variance(df, column):
    """
    Return the variance of one column of the DataFrame.
    :param df: DataFrame holding the data
    :param column: Name of the column to measure
    :return: Value produced by pandas' ``var()`` (called with its defaults)
    """
    selected = df[column]
    return selected.var()

# Function to calculate standard deviation
def calculate_standard_deviation(df, column):
    """
    Return the standard deviation of one column of the DataFrame.
    :param df: DataFrame holding the data
    :param column: Name of the column to measure
    :return: Value produced by pandas' ``std()`` (called with its defaults)
    """
    selected = df[column]
    return selected.std()

# Function to calculate Interquartile Range (IQR)
def calculate_iqr(df, column):
    """
    Return the interquartile range (IQR) of one column of the DataFrame.
    :param df: DataFrame holding the data
    :param column: Name of the column to measure
    :return: 75th-percentile value minus 25th-percentile value
    """
    lower_quartile = df[column].quantile(0.25)
    upper_quartile = df[column].quantile(0.75)
    spread = upper_quartile - lower_quartile
    return spread

# Function to calculate coefficient of variation
def calculate_coefficient_of_variation(df, column):
    """
    Calculates the coefficient of variation of a specified column in the DataFrame.

    The result is the column's standard deviation divided by its mean,
    expressed as a percentage. The ratio is undefined when the mean is zero,
    so NaN is returned in that case rather than dividing by zero.
    :param df: DataFrame containing the data
    :param column: Column name to calculate the coefficient of variation for
    :return: Coefficient of variation in percent, or NaN when the mean is 0
    """
    values = df[column]
    mean = values.mean()

    # Guard the division: a zero mean would otherwise yield inf/NaN together
    # with a divide-by-zero runtime warning.
    if mean == 0:
        return float("nan")

    return (values.std() / mean) * 100


In [5]:
# Function to calculate percentiles
def calculate_percentiles(df, column, percentile):
    """
    Return the value found at a given percentile of a DataFrame column.
    :param df: DataFrame holding the data
    :param column: Name of the column to inspect
    :param percentile: Requested percentile on a 0-100 scale
    :return: Column value at that percentile
    """
    # pandas expects a fraction, so rescale the 0-100 input.
    fraction = percentile / 100
    return df[column].quantile(fraction)

# Function to calculate quartiles
def calculate_quartiles(df, column):
    """
    Return the three quartile cut points of a DataFrame column.
    :param df: DataFrame holding the data
    :param column: Name of the column to inspect
    :return: Tuple (Q1, Q2, Q3), i.e. the 25th, 50th and 75th percentiles
    """
    quartile_fractions = (0.25, 0.50, 0.75)
    return tuple(df[column].quantile(fraction) for fraction in quartile_fractions)

# Function to calculate deciles
def calculate_deciles(df, column):
    """
    Return the nine decile cut points of a DataFrame column.
    :param df: DataFrame holding the data
    :param column: Name of the column to inspect
    :return: List with the values at the 10th, 20th, ..., 90th percentiles
    """
    decile_values = []
    for tenth in range(1, 10):
        decile_values.append(df[column].quantile(tenth / 10))
    return decile_values


In [6]:
# Function to calculate skewness
def calculate_skewness(df, column):
    """
    Return the skewness of one column of the DataFrame.
    :param df: DataFrame holding the data
    :param column: Name of the column to measure
    :return: Value produced by pandas' ``skew()`` for that column
    """
    selected = df[column]
    return selected.skew()

# Function to calculate kurtosis
def calculate_kurtosis(df, column):
    """
    Return the kurtosis of one column of the DataFrame.
    :param df: DataFrame holding the data
    :param column: Name of the column to measure
    :return: Value produced by pandas' ``kurt()`` for that column
    """
    selected = df[column]
    return selected.kurt()

# Function to plot skewness
def plot_skewness(df, column):
    """
    Draws a histogram with a KDE curve for a column and reports its skewness.

    The skewness value appears in the title and a dashed red line marks the
    column mean.
    :param df: DataFrame containing the data
    :param column: Column name to visualise
    """
    _, ax = plt.subplots(figsize=(8, 6))
    values = df[column]
    sns.histplot(values, kde=True, color='blue', ax=ax)

    skew = calculate_skewness(df, column)
    ax.set_title(f"Skewness of {column} (Skewness = {skew:.2f})")
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')

    ax.axvline(values.mean(), color='red', linestyle='dashed', linewidth=1, label='Mean')
    ax.legend()
    plt.show()

# Function to plot kurtosis
def plot_kurtosis(df, column):
    """
    Draws a histogram with a KDE curve for a column and reports its kurtosis.

    The kurtosis value appears in the title and a dashed red line marks the
    column mean.
    :param df: DataFrame containing the data
    :param column: Column name to visualise
    """
    _, ax = plt.subplots(figsize=(8, 6))
    values = df[column]
    sns.histplot(values, kde=True, color='green', ax=ax)

    kurt = calculate_kurtosis(df, column)
    ax.set_title(f"Kurtosis of {column} (Kurtosis = {kurt:.2f})")
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')

    ax.axvline(values.mean(), color='red', linestyle='dashed', linewidth=1, label='Mean')
    ax.legend()
    plt.show()

In [7]:
# Function to plot histogram
def plot_histogram(df, column):
    """
    Draws a 10-bin histogram (no KDE curve) of one DataFrame column.
    :param df: DataFrame holding the data
    :param column: Name of the column to plot
    """
    _, ax = plt.subplots(figsize=(8,6))
    sns.histplot(df[column], bins=10, kde=False, ax=ax)
    ax.set_title(f'Histogram of {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')
    plt.show()

# Function to plot box plot (Box-and-Whisker Plot)
def plot_boxplot(df, column):
    """
    Draws a horizontal box-and-whisker plot of one DataFrame column.
    :param df: DataFrame holding the data
    :param column: Name of the column to plot
    """
    _, ax = plt.subplots(figsize=(8,6))
    sns.boxplot(x=df[column], ax=ax)
    ax.set_title(f'Boxplot of {column}')
    plt.show()

# Function to plot bar chart
def plot_bar_chart(df, column):
    """
    Draws a bar chart of how often each distinct value occurs in a column.
    :param df: DataFrame holding the data
    :param column: Name of the (typically categorical) column to count
    """
    _, ax = plt.subplots(figsize=(8,6))
    frequencies = df[column].value_counts()
    frequencies.plot(kind='bar', ax=ax)
    ax.set_title(f'Bar Chart of {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Count')
    plt.show()

# Function to plot pie chart
def plot_pie_chart(df, column):
    """
    Draws a pie chart of the share of each distinct value in a column.

    Each slice is labelled with its percentage to one decimal place.
    :param df: DataFrame holding the data
    :param column: Name of the (typically categorical) column to count
    """
    _, ax = plt.subplots(figsize=(8,6))
    frequencies = df[column].value_counts()
    frequencies.plot(kind='pie', autopct='%1.1f%%', ax=ax)
    ax.set_title(f'Pie Chart of {column}')
    # Blank out the y-label that pandas would otherwise add to the pie.
    ax.set_ylabel('')
    plt.show()

# Function to plot scatter plot
def plot_scatter_plot(df, column_x, column_y):
    """
    Draws a scatter plot of one DataFrame column against another.
    :param df: DataFrame holding the data
    :param column_x: Name of the column shown on the X-axis
    :param column_y: Name of the column shown on the Y-axis
    """
    _, ax = plt.subplots(figsize=(8,6))
    x_values, y_values = df[column_x], df[column_y]
    ax.scatter(x_values, y_values)
    ax.set_title(f'Scatter Plot of {column_x} vs {column_y}')
    ax.set_xlabel(column_x)
    ax.set_ylabel(column_y)
    plt.show()

# Function to plot line chart
def plot_line_chart(df, column_x, column_y):
    """
    Draws a line chart of one DataFrame column against another.

    Points are connected in the row order of the DataFrame.
    :param df: DataFrame holding the data
    :param column_x: Name of the column shown on the X-axis (e.g., time series)
    :param column_y: Name of the column shown on the Y-axis (values)
    """
    _, ax = plt.subplots(figsize=(8,6))
    x_values, y_values = df[column_x], df[column_y]
    ax.plot(x_values, y_values)
    ax.set_title(f'Line Chart of {column_x} vs {column_y}')
    ax.set_xlabel(column_x)
    ax.set_ylabel(column_y)
    plt.show()

In [None]:
# Testing main function
def _prompt_for_column(available_columns):
    """Ask repeatedly until the user names one of ``available_columns``; return it."""
    while True:
        column = input("Choose a column for analysis: ").strip()
        if column in available_columns:
            return column
        print(f"Column '{column}' not found. Please choose from the available columns.")


def _prompt_for_percentile():
    """Ask repeatedly until the user enters a number between 0 and 100; return it."""
    while True:
        raw_value = input("Enter the percentile (e.g., 90 for 90th percentile): ").strip()
        try:
            percentile = float(raw_value)
        except ValueError:
            print(f"'{raw_value}' is not a number. Please enter a value between 0 and 100.")
            continue
        # NaN fails this comparison too, so it is rejected along with
        # out-of-range values.
        if 0 <= percentile <= 100:
            # Keep whole numbers as ints so the label prints "90th", not "90.0th".
            return int(percentile) if percentile.is_integer() else percentile
        print("The percentile must be between 0 and 100.")


def _print_central_tendency(df, column):
    """Print mean, median, mode, harmonic mean and geometric mean of a column."""
    print(f"Mean: {calculate_mean(df, column)}")
    print(f"Median: {calculate_median(df, column)}")
    print(f"Mode: {calculate_mode(df, column)}")

    # SciPy may reject data these means are not defined for (for example
    # negative values) with a ValueError; report that instead of aborting.
    scipy_means = (
        ("Harmonic Mean", calculate_harmonic_mean),
        ("Geometric Mean", calculate_geometric_mean),
    )
    for label, compute in scipy_means:
        try:
            value = compute(df, column)
        except ValueError as exc:
            print(f"{label}: undefined for this column ({exc})")
        else:
            print(f"{label}: {value}")


def _show_graph(df, column, available_columns):
    """Ask which chart to draw for ``column`` and draw it."""
    print("\nWhich graphical method would you like to visualize?")
    print("1. Histogram")
    print("2. Box Plot")
    print("3. Bar Chart")
    print("4. Pie Chart")
    print("5. Scatter Plot")
    print("6. Line Chart")
    graph_choice = input("Enter the number corresponding to your choice: ").strip()

    single_column_plots = {
        "1": plot_histogram,
        "2": plot_boxplot,
        "3": plot_bar_chart,
        "4": plot_pie_chart,
    }
    # These charts need a second column, which is used for the X-axis.
    two_column_plots = {
        "5": plot_scatter_plot,
        "6": plot_line_chart,
    }

    if graph_choice in single_column_plots:
        single_column_plots[graph_choice](df, column)
    elif graph_choice in two_column_plots:
        column_x = input(f"Choose another column for the X-axis: ({', '.join(available_columns)}) ").strip()
        if column_x in available_columns:
            two_column_plots[graph_choice](df, column_x, column)
        else:
            print(f"Column '{column_x}' not found.")
    else:
        print("Invalid graphical method choice.")


# Testing main function
def main():
    """
    Interactive entry point: load a file, clean it, and run one analysis.

    Prompts for a CSV/XLSX path, previews the raw and cleaned data, asks for
    a column and an analysis family, then prints the statistics or shows the
    requested chart. Invalid input is reported with a message instead of an
    exception. Relies on ``display``, which the notebook environment provides.
    """
    # Ask user for the file path; drop surrounding quotes, which are common
    # when a path is pasted from a file manager.
    file_path = input("Enter the file path (csv or xlsx): ").strip().strip('"\'')

    # Validate file extension (case-insensitively, so "DATA.CSV" is accepted)
    if not file_path.lower().endswith(('.csv', '.xlsx')):
        print("Invalid file type! Please enter a valid CSV or Excel (.xlsx) file.")
        return

    # Load the file
    df = read_file(file_path)
    if df is None:
        # read_file has already printed why loading failed.
        return

    print("Data Loaded Successfully!")
    print("Here is a preview of the data:")
    display(df.head())

    # Clean the data
    df_cleaned = clean_data(df)
    print("Data has been cleaned.")
    print("Here is a preview of the cleaned data:")
    display(df_cleaned.head())

    # Cleaning drops rows with nulls, which can leave nothing to analyse.
    if df_cleaned.empty:
        print("No rows remain after cleaning, so there is nothing to analyse.")
        return

    # Display available columns for user to choose from
    available_columns = df_cleaned.columns.tolist()
    print(f"\nAvailable columns: {', '.join(available_columns)}")

    # Ask the user to choose a valid column for analysis
    column = _prompt_for_column(available_columns)

    # Display available calculations for the user to choose
    print("\nWhich analysis would you like to perform?")
    print("1. Measures of Central Tendency (Mean, Median, Mode, Harmonic Mean, Geometric Mean)")
    print("2. Measures of Dispersion (Range, Variance, Standard Deviation, IQR, Coefficient of Variation)")
    print("3. Measures of Position (Percentiles, Quartiles, Deciles)")
    print("4. Measures of Shape (Skewness, Kurtosis)")
    print("5. Graphical Methods (Histogram, Boxplot, Bar Chart, Pie Chart, Scatter Plot, Line Chart)")

    analysis_choice = input("\nEnter the number corresponding to your choice: ").strip()

    # Options 1-4 compute numeric statistics, which fail on text columns.
    numeric_only_choices = {"1", "2", "3", "4"}
    if analysis_choice in numeric_only_choices and not pd.api.types.is_numeric_dtype(df_cleaned[column]):
        print(f"Column '{column}' is not numeric, so this analysis cannot be computed. "
              "Choose a numeric column or a graphical method.")
        return

    # Perform analysis based on user input
    if analysis_choice == "1":
        # Central Tendency
        _print_central_tendency(df_cleaned, column)

    elif analysis_choice == "2":
        # Dispersion
        print(f"Range: {calculate_range(df_cleaned, column)}")
        print(f"Variance: {calculate_variance(df_cleaned, column)}")
        print(f"Standard Deviation: {calculate_standard_deviation(df_cleaned, column)}")
        print(f"IQR: {calculate_iqr(df_cleaned, column)}")
        print(f"Coefficient of Variation: {calculate_coefficient_of_variation(df_cleaned, column)}")

    elif analysis_choice == "3":
        # Position
        percentile = _prompt_for_percentile()
        print(f"{percentile}th Percentile: {calculate_percentiles(df_cleaned, column, percentile)}")
        print(f"Quartiles: {calculate_quartiles(df_cleaned, column)}")
        print(f"Deciles: {calculate_deciles(df_cleaned, column)}")

    elif analysis_choice == "4":
        # Shape
        print(f"Skewness: {calculate_skewness(df_cleaned, column)}")
        print(f"Kurtosis: {calculate_kurtosis(df_cleaned, column)}")

        # Plot Skewness and Kurtosis
        plot_skewness(df_cleaned, column)
        plot_kurtosis(df_cleaned, column)

    elif analysis_choice == "5":
        # Graphical Methods
        _show_graph(df_cleaned, column, available_columns)

    else:
        print("Invalid analysis choice. Please restart the program.")

# Run the interactive workflow when this cell executes; main() waits for
# keyboard input (file path, column name, analysis choice) before continuing.
main()


Enter the file path (csv or xlsx): C:\Users\Asus\Desktop\ClassicHit.csv
Data Loaded Successfully!
Here is a preview of the data:


Unnamed: 0,Track,Artist,Year,Duration,Time_Signature,Danceability,Energy,Key,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Popularity,Genre
0,Hey Jack Kerouac,"10,000 Maniacs",1987,206413,4,0.616,0.511,6,-15.894,1,0.0279,0.0384,0.0,0.15,0.604,132.015,40,Alt. Rock
1,Like the Weather,"10,000 Maniacs",1987,236653,4,0.77,0.459,1,-17.453,1,0.0416,0.112,0.00343,0.145,0.963,133.351,43,Alt. Rock
2,What's the Matter Here?,"10,000 Maniacs",1987,291173,4,0.593,0.816,9,-7.293,1,0.041,0.00449,3.2e-05,0.0896,0.519,99.978,12,Alt. Rock
3,Trouble Me,"10,000 Maniacs",1989,193560,4,0.861,0.385,2,-10.057,1,0.0341,0.154,0.0,0.123,0.494,117.913,47,Alt. Rock
4,Candy Everybody Wants,"10,000 Maniacs",1992,185960,4,0.622,0.876,10,-6.31,1,0.0305,0.0193,0.00684,0.0987,0.867,104.97,43,Alt. Rock


Data has been cleaned.
Here is a preview of the cleaned data:


Unnamed: 0,Track,Artist,Year,Duration,Time_Signature,Danceability,Energy,Key,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Popularity,Genre
0,Hey Jack Kerouac,"10,000 Maniacs",1987,206413,4,0.616,0.511,6,-15.894,1,0.0279,0.0384,0.0,0.15,0.604,132.015,40,Alt. Rock
1,Like the Weather,"10,000 Maniacs",1987,236653,4,0.77,0.459,1,-17.453,1,0.0416,0.112,0.00343,0.145,0.963,133.351,43,Alt. Rock
2,What's the Matter Here?,"10,000 Maniacs",1987,291173,4,0.593,0.816,9,-7.293,1,0.041,0.00449,3.2e-05,0.0896,0.519,99.978,12,Alt. Rock
3,Trouble Me,"10,000 Maniacs",1989,193560,4,0.861,0.385,2,-10.057,1,0.0341,0.154,0.0,0.123,0.494,117.913,47,Alt. Rock
4,Candy Everybody Wants,"10,000 Maniacs",1992,185960,4,0.622,0.876,10,-6.31,1,0.0305,0.0193,0.00684,0.0987,0.867,104.97,43,Alt. Rock



Available columns: Track, Artist, Year, Duration, Time_Signature, Danceability, Energy, Key, Loudness, Mode, Speechiness, Acousticness, Instrumentalness, Liveness, Valence, Tempo, Popularity, Genre
