<a href="https://colab.research.google.com/github/Salhoni/Data-analysis-tool/blob/main/dataanalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from scipy import stats
from ipywidgets import interact, widgets, HBox, VBox
from google.colab import files

# File upload
def load_data():
    uploaded = files.upload()  # Allows file upload
    file_name = next(iter(uploaded))  # Get the uploaded file name

    # Detect file type and load data accordingly
    if file_name.endswith('.csv'):
        df = pd.read_csv(file_name)
    elif file_name.endswith('.xlsx'):
        df = pd.read_excel(file_name)
    elif file_name.endswith('.txt'):
        df = pd.read_csv(file_name, delimiter='\t')
    else:
        raise ValueError("Unsupported file format! Please upload CSV, Excel, or TXT file.")

    return df

# Load dataset
df = load_data()

# Display dataset
print("### Dataset Preview")
print(df.head())

# Display summary statistics
print("### Summary Statistics")
print(df.describe())

# Handle Missing Values
def handle_missing_values(option):
    if option == 'Drop rows':
        df_cleaned = df.dropna()
    elif option == 'Fill with mean':
        df_cleaned = df.fillna(df.mean())
    elif option == 'Fill with median':
        df_cleaned = df.fillna(df.median())
    print("Updated Data after Missing Values Handling")
    print(df_cleaned.head())

# Missing values options
missing_options = ['Drop rows', 'Fill with mean', 'Fill with median']
missing_values_dropdown = widgets.Dropdown(options=missing_options, description='Missing Values:')
widgets.interactive(handle_missing_values, option=missing_values_dropdown)

# Visualization Functions
def visualize_data(plot_type, x_col, y_col):
    if plot_type == "Line Plot":
        plt.figure(figsize=(10, 5))
        plt.plot(df[x_col], df[y_col], color="green")
        plt.title(f'Line Plot of {y_col} vs {x_col}')
        plt.xlabel(x_col)
        plt.ylabel(y_col)
        plt.show()

    elif plot_type == "Bar Chart":
        plt.figure(figsize=(10, 5))
        plt.bar(df[x_col], df[y_col], color="orange")
        plt.title(f'Bar Chart of {y_col} vs {x_col}')
        plt.xlabel(x_col)
        plt.ylabel(y_col)
        plt.show()

    elif plot_type == "Scatter Plot":
        fig = px.scatter(df, x=x_col, y=y_col)
        fig.show()

    elif plot_type == "Box Plot":
        fig = px.box(df, x=x_col, y=y_col)
        fig.show()

    elif plot_type == "Histogram":
        fig = px.histogram(df, x=x_col)
        fig.show()

    elif plot_type == "Correlation Heatmap":
        plt.figure(figsize=(10, 8))
        sns.heatmap(df.corr(), annot=True, cmap="coolwarm")
        plt.title('Correlation Heatmap')
        plt.show()

# Interactive Widgets for Visualization
plot_types = ['Line Plot', 'Bar Chart', 'Scatter Plot', 'Box Plot', 'Histogram', 'Correlation Heatmap']
x_col = widgets.Dropdown(options=df.columns, description='X-axis:')
y_col = widgets.Dropdown(options=df.columns, description='Y-axis:')
plot_type = widgets.Dropdown(options=plot_types, description='Plot Type:')
widgets.interactive(visualize_data, plot_type=plot_type, x_col=x_col, y_col=y_col)

# Linear Regression Modeling
def linear_regression_model(x_col, y_col):
    X = df[[x_col]].values.reshape(-1, 1)
    y = df[y_col].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = LinearRegression()
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)
    print("### Model Performance")
    print("Mean Squared Error:", mean_squared_error(y_test, predictions))
    print("R-squared:", r2_score(y_test, predictions))

    plt.figure(figsize=(10, 5))
    plt.scatter(y_test, predictions)
    plt.plot(y_test, y_test, color='r')
    plt.title('Predictions vs Actual')
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.show()

# Interactive Widgets for Linear Regression
widgets.interactive(linear_regression_model, x_col=x_col, y_col=y_col)

# Advanced Statistical Analysis
def run_t_test(col1, col2):
    t_stat, p_value = stats.ttest_ind(df[col1].dropna(), df[col2].dropna())
    print(f"T-Statistic: {t_stat}")
    print(f"P-Value: {p_value}")

def run_chi_square(cat_col1, cat_col2):
    contingency_table = pd.crosstab(df[cat_col1], df[cat_col2])
    chi2, p, dof, ex = stats.chi2_contingency(contingency_table)
    print(f"Chi-Square Statistic: {chi2}")
    print(f"P-Value: {p}")

def run_anova(anova_col, numeric_col):
    grouped_data = [group[numeric_col] for name, group in df.groupby(anova_col)]
    f_stat, p_value = stats.f_oneway(*grouped_data)
    print(f"F-Statistic: {f_stat}")
    print(f"P-Value: {p_value}")


# ... (previous code) ...

# Widgets for Advanced Statistical Analysis
t_test_cols = widgets.SelectMultiple(options=df.select_dtypes(include=np.number).columns, description='T-Test Columns:')

# Modified interactive function to handle SelectMultiple values
def run_t_test_modified(t_test_cols):
    if len(t_test_cols) >= 2:  # Ensure at least 2 columns are selected
        col1, col2 = t_test_cols[:2]  # Get the first two selected columns
        t_stat, p_value = stats.ttest_ind(df[col1].dropna(), df[col2].dropna())
        print(f"T-Statistic: {t_stat}")
        print(f"P-Value: {p_value}")
    else:
        print("Please select at least 2 columns for T-test.")

# Use the modified function with interactive
widgets.interactive(run_t_test_modified, t_test_cols=t_test_cols)

# ... (rest of the code) ...

Saving trunk.csv to trunk.csv
### Dataset Preview
   trunk
0     11
1     11
2     12
3     16
4     20
### Summary Statistics
           trunk
count  74.000000
mean   13.756757
std     4.277404
min     5.000000
25%    10.250000
50%    14.000000
75%    16.750000
max    23.000000


interactive(children=(SelectMultiple(description='T-Test Columns:', options=('trunk',), value=()), Output()), …