In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
#from google.colab import files  # For Google Colab file upload
import io

In [2]:
# Function to load data from various sources
def load_data():
    try:
        # For Google Colab
        uploaded = files.upload()
        file_name = list(uploaded.keys())[0]
        file_content = uploaded[file_name]
        
        # Determine file type and read accordingly
        if file_name.endswith('.csv'):
            return pd.read_csv(io.BytesIO(file_content))
        elif file_name.endswith('.json'):
            return pd.read_json(io.BytesIO(file_content))
        elif file_name.endswith(('.xlsx', '.xls')):
            return pd.read_excel(io.BytesIO(file_content))
        else:
            raise ValueError(f"Unsupported file format: {file_name}")

SyntaxError: incomplete input (3262364447.py, line 17)

In [2]:
def load_data():
    try:
        # For Jupyter Notebook (local environment)
        file_path = input("Enter the path to your dataset: ")
        file_extension = file_path.split('.')[-1].lower()
        
        if file_extension == 'csv':
            return pd.read_csv(file_path)
        elif file_extension == 'json':
            return pd.read_json(file_path)
        elif file_extension in ['xlsx', 'xls']:
            return pd.read_excel(file_path)
        else:
            raise ValueError(f"Unsupported file format: {file_extension}")
    
    except FileNotFoundError:
        print(f"Error: The file at {file_path} was not found.")
    except ValueError as ve:
        print(ve)
    except Exception as e:
        print(f"An error occurred: {e}")


In [3]:
def clean_data(data):
    return data.dropna()

In [4]:
def linear_regression(data, x_column, y_column):
    X = data[[x_column]]
    y = data[y_column]
    model = LinearRegression()
    model.fit(X, y)
    predictions = model.predict(X)
    return model, predictions

In [5]:
def kmeans_clustering(data, columns, n_clusters=3):
    X = data[columns]
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    return kmeans.fit_predict(X)

In [6]:
def pca_analysis(data, columns, n_components=2):
    X = data[columns]
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    pca = PCA(n_components=n_components)
    return pca.fit_transform(X_scaled)

In [7]:
def generate_report(data):
    return data.describe()

In [8]:
def plot_data(data, x_column, y_column, predictions):
    plt.figure(figsize=(10, 6))
    plt.scatter(data[x_column], data[y_column], color='blue', label='Actual')
    plt.plot(data[x_column], predictions, color='red', label='Predicted')
    plt.xlabel(x_column)
    plt.ylabel(y_column)
    plt.title(f'{y_column} vs {x_column}')
    plt.legend()
    plt.show()

In [9]:
# Main analysis function
def analyze_olympic_data():
    print("Olympic Data Analysis Tool")
    

In [12]:
# Load and clean data
data = load_data()
data = clean_data(data)
    
print("\nData loaded successfully. Shape:", data.shape)
print("\nColumns:", data.columns.tolist())

Enter the path to your dataset: C:\Users\Sakthi\olympics2024.csv

Data loaded successfully. Shape: (91, 7)

Columns: ['Rank', 'Country', 'Country Code', 'Gold', 'Silver', 'Bronze', 'Total']


In [13]:
def main():
    while True:
        print("\nChoose an analysis option:")
        print("1. Linear Regression")
        print("2. K-Means Clustering")
        print("3. PCA Analysis")
        print("4. Generate Report")
        print("5. Exit")
        
        choice = input("Enter your choice (1-5): ")
        
        if choice == '1':
            x_column = input("Enter the independent variable column name: ")
            y_column = input("Enter the dependent variable column name: ")
            model, predictions = linear_regression(data, x_column, y_column)
            print(f"\nCoefficient: {model.coef_[0]}, Intercept: {model.intercept_}")
            plot_data(data, x_column, y_column, predictions)
        
        elif choice == '2':
            columns = input("Enter column names for clustering (comma-separated): ").split(',')
            n_clusters = int(input("Enter the number of clusters: "))
            labels = kmeans_clustering(data, columns, n_clusters)
            data['Cluster'] = labels
            print("\nClustering complete. 'Cluster' column added to the dataset.")
            print(data[['Cluster'] + columns].head())
        
        elif choice == '3':
            columns = input("Enter column names for PCA (comma-separated): ").split(',')
            pca_result = pca_analysis(data, columns)
            data['PCA1'] = pca_result[:, 0]
            data['PCA2'] = pca_result[:, 1]
            print("\nPCA complete. 'PCA1' and 'PCA2' columns added to the dataset.")
            print(data[['PCA1', 'PCA2'] + columns].head())
            
        # Visualize PCA results
            plt.figure(figsize=(10, 6))
            plt.scatter(data['PCA1'], data['PCA2'])
            plt.xlabel('PCA1')
            plt.ylabel('PCA2')
            plt.title('PCA Results')
            plt.show()
        elif choice == '4':
            report = generate_report(data)
            print("\nDescriptive Statistics Report:")
            display(report)  # Use display for better formatting in notebooks
        
        elif choice == '5':
            print("Exiting the program. Goodbye!")
            break
        
        else:
            print("Invalid choice. Please try again.")    

In [15]:
if __name__ == "__main__":
    main()


Choose an analysis option:
1. Linear Regression
2. K-Means Clustering
3. PCA Analysis
4. Generate Report
5. Exit
Enter your choice (1-5): 5
Exiting the program. Goodbye!


In [None]:
# Run the analysis
analyze_olympic_data()