<a href="https://colab.research.google.com/github/Sayed-Hossein-Hosseini/Dancing_with_Polynomial_Predictions/blob/master/Dancing_with_Polynomial_Predictions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Dancing with Polynomial Predictions**

## **Libraries**

In [5]:
import gdown
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Silence every FutureWarning raised from here on (process-wide filter), so
# deprecation notices from the imported libraries do not appear in cell output.
warnings.simplefilter(action='ignore', category=FutureWarning)

## **Dataset Description**

In [2]:
# Load the Excel file
file_path = 'Polynomial_Functions.xlsx'  # Change the path if needed
df = pd.read_excel(file_path, sheet_name='Sheet1')

# Display general information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
print("🔹 Dataset Information:")
df.info()

# Check for missing values in each column
print("\n🔹 Missing Values Per Column:")
print(df.isnull().sum())

# Show descriptive statistics (count, mean, std, min, quartiles, max)
print("\n🔹 Descriptive Statistics:")
print(df.describe())

# Count unique values in each column
print("\n🔹 Number of Unique Values Per Column:")
print(df.nunique())


🔹 Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   x           10000 non-null  float64
 1   y           10000 non-null  float64
 2   z           10000 non-null  float64
 3   F(x, y, z)  10000 non-null  float64
dtypes: float64(4)
memory usage: 312.6 KB
None

🔹 Missing Values Per Column:
x             0
y             0
z             0
F(x, y, z)    0
dtype: int64

🔹 Descriptive Statistics:
                  x             y             z    F(x, y, z)
count  10000.000000  10000.000000  10000.000000  10000.000000
mean      -0.116809      0.090598      0.001008     43.803057
std        5.752603      5.785891      5.735475    587.941231
min       -9.999767     -9.996845     -9.999038  -3191.680304
25%       -5.073423     -4.921084     -4.925042   -225.322605
50%       -0.149428      0.117936      0.041362     15.622764
75%      

## **Train/Test Split**

In [6]:
# Features are every column except the target; the target is the F(x, y, z) column
X = df.drop(columns='F(x, y, z)')
y = df['F(x, y, z)']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report how many rows ended up in each split
print(f"Training set size: {X_train.shape[0]} rows")
print(f"Test set size: {X_test.shape[0]} rows")


Training set size: 8000 rows
Test set size: 2000 rows


## **Preprocessing**

### **Remove Outliers**

In [8]:
def _iqr_bounds(values, multiplier):
    """Return the (lower, upper) IQR fence of a numeric Series: Q1/Q3 -/+ multiplier * IQR."""
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = q3 - q1
    return q1 - multiplier * spread, q3 + multiplier * spread


def remove_outliers_auto_xy_with_output(x: pd.DataFrame, y: pd.Series, iqr_multiplier: float = 1.5):
    """
    Detect and remove outliers from the training data (x, y) using the IQR method.

    A row is dropped from both x and y when any numeric column of x, or the
    target y, falls outside [Q1 - k * IQR, Q3 + k * IQR] (bounds inclusive),
    where k is ``iqr_multiplier``. A short report is printed for every column.

    Notes:
        * The fences for each column of x are computed from the *unfiltered*
          input column, while the fence for y is computed from the target
          values that remain after the x-based filtering.
        * Rows with a missing value in a checked column fail the range test
          and are therefore dropped as well.
        * x and y are expected to share the same index, because the boolean
          masks built from one are applied to the other.
        * The inputs are never modified; new objects are returned.

    Parameters:
        x (pd.DataFrame): Input feature DataFrame (training data)
        y (pd.Series): Input target Series (labels)
        iqr_multiplier (float): Width of the fence in IQR units. Defaults to
            1.5 (the conventional Tukey fence); larger values remove fewer rows.

    Returns:
        pd.DataFrame, pd.Series: Cleaned feature DataFrame and target Series with outliers removed

    Raises:
        ValueError: If ``iqr_multiplier`` is negative.
    """
    if iqr_multiplier < 0:
        raise ValueError(f"iqr_multiplier must be non-negative, got {iqr_multiplier}")

    initial_shape = x.shape
    print(f"🔹 Initial dataset size: {initial_shape[0]} rows, {initial_shape[1]} columns (x), {y.shape[0]} rows (y)\n")

    # Boolean indexing below always returns new objects, so the inputs stay
    # untouched without an up-front defensive copy.
    x_clean, y_clean = x, y

    # Remove outliers column by column for the numeric features of x
    for col in x.select_dtypes(include='number').columns:
        # Fence comes from the original column, not the progressively filtered one
        lower_bound, upper_bound = _iqr_bounds(x[col], iqr_multiplier)

        before = x_clean.shape[0]
        mask = x_clean[col].between(lower_bound, upper_bound)

        # Apply the same mask to both x and y so they stay row-aligned
        x_clean = x_clean[mask]
        y_clean = y_clean[mask]

        removed = before - x_clean.shape[0]
        if removed > 0:
            print(f"🟠 Removed {removed} outliers from column '{col}' in x")
        else:
            print(f"✅ No outliers detected in column '{col}' in x")

    # Remove outliers in the target, using only the rows that survived above
    lower_bound_y, upper_bound_y = _iqr_bounds(y_clean, iqr_multiplier)

    before_y = y_clean.shape[0]
    y_mask = y_clean.between(lower_bound_y, upper_bound_y)

    # Apply the same mask to both x and y so they stay row-aligned
    x_clean = x_clean[y_mask]
    y_clean = y_clean[y_mask]

    removed_y = before_y - x_clean.shape[0]
    if removed_y > 0:
        print(f"🟠 Removed {removed_y} outliers from target variable y")
    else:
        print("✅ No outliers detected in target variable y")

    print(f"\n✅ Final dataset size: {x_clean.shape[0]} rows (removed {initial_shape[0] - x_clean.shape[0]} total rows)")
    return x_clean, y_clean

# Filter outliers out of the training split only (X_test / y_test are not passed in);
# the cleaned copies get new names, so X_train / y_train remain available unchanged.
X_train_clean, y_train_clean = remove_outliers_auto_xy_with_output(X_train, y_train)


🔹 Initial dataset size: 8000 rows, 3 columns (x), 8000 rows (y)

✅ No outliers detected in column 'x' in x
✅ No outliers detected in column 'y' in x
✅ No outliers detected in column 'z' in x
🟠 Removed 603 outliers from target variable y

✅ Final dataset size: 7397 rows (removed 603 total rows)
