# DATA ANALYSIS AND DATA SCIENCE WITH PYTHON
## TASK 2: Exploratory Data Analysis (EDA) and Sales Performance Analysis

**Objective**: Perform comprehensive EDA and build a sales prediction model

In [None]:
# Import all required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Configure visualization settings
%matplotlib inline
plt.style.use('seaborn')
sns.set(style="whitegrid", palette="muted")
pd.set_option('display.max_columns', 100)
pd.set_option('display.float_format', lambda x: '%.2f' % x)

## 1. Data Loading and Cleaning

In [None]:
def load_and_clean_data(filepath):
    """Load a CSV file and return a cleaned DataFrame.

    Cleaning steps, applied in order:

    1. Drop every row that contains at least one missing value.
    2. Drop exact duplicate rows.
    3. For each numeric column in turn, drop rows outside
       ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``. Quartiles are recomputed on the
       rows that survived the previous columns' filters. Columns whose IQR
       is zero (constant or dominated by a single value) are left alone,
       because the rule would otherwise delete every row that differs from
       that value.

    A short inspection report (head, shape, missing-value counts) is shown
    before and after cleaning.

    Args:
        filepath: Path, URL or file-like object accepted by
            ``pandas.read_csv``.

    Returns:
        The cleaned ``DataFrame``, or ``None`` when the file could not be
        read or parsed (the error message is printed).
    """
    # `display` is injected by IPython/Jupyter only; fall back to `print`
    # so the function also works in a plain Python interpreter.
    try:
        show = display
    except NameError:
        show = print

    # Guard only the read itself, so that a failure later in the pipeline is
    # not misreported as a loading error. OSError covers missing/unreadable
    # files; ValueError covers pandas' parser, empty-data and decoding errors.
    try:
        df = pd.read_csv(filepath)
    except (OSError, ValueError) as e:
        print(f"Error loading data: {e}")
        return None

    # Initial inspection
    print(f"\n{'='*50}\nInitial Data Inspection\n{'='*50}")
    show(df.head(3))
    print(f"\nShape: {df.shape}")
    print("\nMissing values:")
    show(df.isnull().sum())

    # Handle missing values, then remove duplicates
    df = df.dropna()
    df = df.drop_duplicates()

    # Remove outliers (for numerical columns), one column after another
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        # Written as `not iqr > 0` so that a NaN IQR (empty frame) is
        # skipped as well as a zero one.
        if not iqr > 0:
            continue
        df = df[(df[col] >= q1 - 1.5*iqr) & (df[col] <= q3 + 1.5*iqr)]

    # Final check
    print(f"\n{'='*50}\nAfter Cleaning\n{'='*50}")
    print(f"New shape: {df.shape}")
    print("\nMissing values after cleaning:")
    show(df.isnull().sum())

    return df

In [None]:
# Load your dataset (replace with your actual file path)
# `df` is the cleaned DataFrame, or None when load_and_clean_data reports a
# loading error; the later cells check `df is not None` before using it.
file_path = "sales_data.csv"  # or use "global_superstore.csv"
df = load_and_clean_data(file_path)

## 2. Exploratory Data Analysis (EDA)

In [None]:
def perform_eda(df):
    """Print summary statistics and plot an overview of every column.

    Output, in order:

    * descriptive statistics (``df.describe()`` transposed),
    * a correlation heatmap of the numeric columns,
    * histograms of the numeric columns,
    * for each object/category column: its number of unique values and,
      when it has fewer than 15 of them, a count plot.

    Args:
        df: DataFrame to explore. It is only read, never modified.

    Returns:
        None. All results are printed or rendered as figures.
    """
    print(f"\n{'='*50}\nExploratory Data Analysis\n{'='*50}")

    # Descriptive statistics
    print("\nDescriptive Statistics:")
    display(df.describe().T)

    # Restrict to numeric columns up front: since pandas 2.0, DataFrame.corr()
    # raises on non-numeric columns instead of silently dropping them.
    numeric_df = df.select_dtypes(include=[np.number])
    n_numeric = numeric_df.shape[1]

    # Correlation matrix
    print("\nCorrelation Matrix:")
    if n_numeric > 0:
        plt.figure(figsize=(12, 8))
        sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', center=0, fmt='.2f')
        plt.title('Correlation Heatmap')
        plt.show()
    else:
        print("No numerical columns to correlate")

    # Distributions of numerical features
    print("\nNumerical Features Distribution:")
    if n_numeric > 0:
        # Keep the 3x3 grid for up to nine columns and add rows beyond that;
        # a fixed (3, 3) layout raises once there are more than nine plots.
        n_rows = max(3, -(-n_numeric // 3))
        numeric_df.hist(bins=20, figsize=(15, 10), layout=(n_rows, 3))
        plt.tight_layout()
        plt.show()
    else:
        print("No numerical columns to plot")

    # Categorical analysis
    cat_cols = df.select_dtypes(include=['object', 'category']).columns
    if len(cat_cols) > 0:
        print("\nCategorical Features Analysis:")
        for col in cat_cols:
            n_unique = df[col].nunique()
            print(f"\nUnique values in {col}: {n_unique}")
            if n_unique < 15:  # Only plot for columns with limited categories
                plt.figure(figsize=(10, 4))
                # Bars ordered from most to least frequent category.
                sns.countplot(y=col, data=df, order=df[col].value_counts().index)
                plt.title(f'Distribution of {col}')
                plt.show()

In [None]:
# Perform EDA
# Skipped when the loading step returned None instead of a DataFrame.
if df is not None:
    perform_eda(df)

## 3. Sales Performance Analysis

In [None]:
def analyze_sales(df, target_col='Sales'):
    """Plot sales breakdowns and fit a linear regression on the numeric columns.

    Steps:

    1. If a ``'Date'`` column exists, plot total ``target_col`` per month.
    2. For every object/category column with fewer than 10 unique values,
       plot total ``target_col`` per category.
    3. Fit ``LinearRegression`` predicting ``target_col`` from all other
       numeric columns (80/20 train/test split, ``random_state=42``), print
       MSE and R-squared on the test split, and show the coefficients.

    The caller's DataFrame is not modified.

    Args:
        df: DataFrame containing ``target_col``.
        target_col: Name of the column to analyse and predict.

    Returns:
        The fitted ``LinearRegression`` model, or ``None`` when there is no
        numeric column other than ``target_col`` to use as a feature.

    Raises:
        KeyError: If ``target_col`` is not a column of ``df``.
    """
    # Fail early with a clear message instead of part-way through plotting.
    if target_col not in df.columns:
        raise KeyError(f"Target column {target_col!r} not found in DataFrame")

    print(f"\n{'='*50}\nSales Performance Analysis\n{'='*50}")

    # Time series analysis if date column exists
    if 'Date' in df.columns:
        # Work on a copy so the date conversion and the helper column below
        # do not leak into the caller's DataFrame.
        df = df.copy()
        df['Date'] = pd.to_datetime(df['Date'])
        df['YearMonth'] = df['Date'].dt.to_period('M')

        # Monthly sales trend
        monthly_sales = df.groupby('YearMonth')[target_col].sum().reset_index()
        # Periods are turned into strings so they can be used as axis labels.
        monthly_sales['YearMonth'] = monthly_sales['YearMonth'].astype(str)

        plt.figure(figsize=(15, 6))
        sns.lineplot(x='YearMonth', y=target_col, data=monthly_sales, marker='o')
        plt.xticks(rotation=45)
        plt.title('Monthly Sales Trend')
        plt.ylabel('Total Sales')
        plt.xlabel('Month')
        plt.grid(True)
        plt.show()

    # Sales by categorical features (same dtype selection as perform_eda,
    # so 'category' columns are covered as well as 'object' ones)
    cat_cols = [
        col for col in df.select_dtypes(include=['object', 'category']).columns
        if df[col].nunique() < 10
    ]

    for col in cat_cols:
        plt.figure(figsize=(10, 5))
        sns.barplot(x=col, y=target_col, data=df, estimator=np.sum)
        plt.title(f'Total Sales by {col}')
        plt.xticks(rotation=45)
        plt.show()

    # Prepare data for modeling: every numeric column except the target
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    X = df[numeric_cols].drop(target_col, axis=1, errors='ignore')
    y = df[target_col]

    if len(X.columns) == 0:
        print("No suitable numerical features found for modeling")
        return None

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Linear regression model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Evaluate model on the held-out split
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"\n{'='*50}\nModel Evaluation\n{'='*50}")
    print(f"Mean Squared Error: {mse:.2f}")
    print(f"R-squared: {r2:.2f}")

    # Feature importance
    # NOTE: these are raw coefficients on unscaled features, sorted by signed
    # value; their magnitudes depend on each feature's units.
    importance = pd.DataFrame({
        'Feature': X.columns,
        'Coefficient': model.coef_
    }).sort_values('Coefficient', ascending=False)

    print(f"\n{'='*50}\nFeature Importance\n{'='*50}")
    display(importance)

    plt.figure(figsize=(10, 6))
    sns.barplot(x='Coefficient', y='Feature', data=importance)
    plt.title('Feature Importance (Linear Regression Coefficients)')
    plt.show()

    return model

In [None]:
# Analyze sales performance if data is loaded
# `model` receives whatever analyze_sales returns; it is left undefined when
# `df` is None because the call is skipped.
if df is not None:
    model = analyze_sales(df)

## 4. Summary and Next Steps

In [None]:
# Static recap of the notebook's steps. The text is fixed and is printed
# unconditionally, i.e. also when `df` is None and the analysis was skipped.
print(f"\n{'='*50}\nAnalysis Summary\n{'='*50}")
print("""
1. Data loaded and cleaned (missing values, duplicates, outliers handled)
2. Comprehensive EDA performed (statistics, correlations, distributions)
3. Sales performance analyzed through:
   - Time trends (if date available)
   - Category breakdowns
4. Predictive model built with evaluation metrics

Next Steps:
- Feature engineering to improve model accuracy
- Try more advanced models (Random Forest, XGBoost)
- Business recommendations based on insights
""")