### Install and Import Third Party Dependencies

In [None]:
%pip install pandas numpy matplotlib seaborn scipy scikit-learn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore

### Initialize datasets and columns for outlier detection

In [None]:
# Load the raw dataset (update the file path if needed).
df = pd.read_csv("data.csv")

# Keep only the numeric columns; the outlier-detection cells below start from this frame.
df_numeric = df.select_dtypes(include='number')

# Dataset overview. DataFrame.info() writes its report to stdout itself and
# returns None, so it is called directly -- wrapping it in print() would add a
# stray "None" line after the report.
df.info()
print(df.describe())


### Outliers Detected using Z-score

In [None]:
# Work on an independent copy. Assigning new columns through a plain alias of
# `df_numeric` would add 'z_score' and 'is_outlier' to the shared frame and
# leak them into every later cell that starts from `df_numeric`.
df_z = df_numeric.copy()

# Mean and standard deviation of 'imdbNumVotes' (pandas .std() uses ddof=1 by default)
mean_votes = df_z['imdbNumVotes'].mean()
std_votes = df_z['imdbNumVotes'].std()

# Z-score: distance from the mean expressed in standard deviations
df_z['z_score'] = (df_z['imdbNumVotes'] - mean_votes) / std_votes

# Rows whose |z| exceeds this threshold are flagged as outliers
threshold = 3

# Flag outliers on both tails
df_z['is_outlier'] = np.abs(df_z['z_score']) > threshold

# Keep only the flagged rows for display
outliers = df_z[df_z['is_outlier']]

print("Outliers detected using Z-score:")
print(outliers)

### Z-Score Outliers Visualization using Box Plot

In [None]:
# Box plot of the vote counts; the single box sits at x position 0
sns.boxplot(y=df_z['imdbNumVotes'], color='lightblue')

# Draw the rows flagged by the Z-score rule as red points on top of the box
outliers = df_z.loc[df_z['is_outlier']]
sns.scatterplot(
    x=[0] * len(outliers),
    y=outliers['imdbNumVotes'],
    color='red',
    label='Z-score Outliers',
    zorder=10,
)

# Title, axis label and legend on the current axes, then render
plt.gca().set(
    title='Box Plot of IMDb Number of Votes with Z-score Outliers',
    ylabel='IMDb Number of Votes',
)
plt.gca().legend()
plt.show()

### Outliers Detected using IQR

In [None]:
# Work on an independent copy so the 'is_outlier' flag written below does not
# end up as an extra column of the shared `df_numeric` frame.
df_iqr = df_numeric.copy()

# Q1 (25th percentile), Q3 (75th percentile) and their spread for imdbNumVotes
Q1 = df_iqr['imdbNumVotes'].quantile(0.25)
Q3 = df_iqr['imdbNumVotes'].quantile(0.75)
IQR = Q3 - Q1

# Outlier boundaries (scalars): 1.5 * IQR beyond each quartile
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Flag rows whose vote count falls outside the boundaries
df_iqr['is_outlier'] = (df_iqr['imdbNumVotes'] < lower_bound) | (df_iqr['imdbNumVotes'] > upper_bound)

# Pull the flagged rows from the full dataset; `df_iqr` keeps the index of
# `df`, so the boolean mask aligns row for row.
df_outliers_iqr = df[df_iqr['is_outlier']]

print("\nOutliers detected using IQR:")
print(df_outliers_iqr)

### IQR Outliers Visualization using Scatter Plot

In [None]:

# Scatter plot of average rating vs. number of votes, coloured by the IQR outlier flag
plt.figure(figsize=(10, 6))
ax = sns.scatterplot(
    data=df_iqr,
    x='imdbNumVotes', y='imdbAverageRating',
    hue='is_outlier',  # Color by outlier status
    palette={True: 'red', False: 'blue'},  # Red for outliers, blue for non-outliers
    legend='brief'
)

# The previous per-row plt.text(..., s="") loop was dropped: an empty string
# renders nothing, so it only added one invisible Text artist per outlier.

plt.title('IMDb Ratings vs. Number of Votes (Outliers Highlighted)')
plt.xlabel('Number of Votes')
plt.ylabel('Average Rating')

# Rename the legend entries through the handles seaborn created for each hue
# level. Passing only `labels=[...]` to plt.legend() pairs the labels with the
# axes' artists by order, which is not guaranteed to match the hue levels.
handles, labels = ax.get_legend_handles_labels()
display_names = {'False': 'No', 'True': 'Yes'}
ax.legend(handles, [display_names.get(label, label) for label in labels], title='Outlier')
plt.show()

### Detecting outliers using Isolation Forest

In [None]:
# Select the vote-count and average-rating columns from the numeric frame.
# NOTE(review): `x` is not referenced by any later cell visible in this notebook -- confirm before removing.
x = df_numeric[["imdbNumVotes", "imdbAverageRating"]]

In [None]:
from sklearn.ensemble import IsolationForest

# Work on an independent copy so the 'anomaly_score' column written below is
# not added to the shared `df_numeric` frame (where later cells would pick it
# up as if it were a feature).
df_iso_forest = df_numeric.copy()

# contamination=0.05: the expected share of outliers; random_state fixes the result
iso_forest = IsolationForest(contamination=0.05, random_state=42)
# fit_predict returns a label per row: -1 for outliers, 1 for inliers
df_iso_forest['anomaly_score'] = iso_forest.fit_predict(df_iso_forest[['imdbNumVotes', 'imdbAverageRating']])

# Rating vs. votes, coloured by the Isolation Forest label
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df_iso_forest, x='imdbNumVotes', y='imdbAverageRating', hue='anomaly_score', palette=['red', 'blue'])
plt.title('Isolation Forest Outlier Detection')
plt.show()



### Comparing Count using Statistical Method and Machine Learning Method

In [None]:
# Tally the rows flagged by each method:
#   - IQR: the boolean flag column sums to the number of flagged rows
#   - Isolation Forest: rows labelled -1 are the detected outliers
outlier_count_iqr = df_iqr.loc[:, 'is_outlier'].sum()
outlier_count_iso = df_iso_forest['anomaly_score'].eq(-1).sum()

# Report both counts
print(f'Number of outliers detected using IQR method: {outlier_count_iqr}')
print(f'Number of outliers detected using Isolation Forest: {outlier_count_iso}')


### Cleaned outliers using the IQR Method

In [None]:
# Start from a copy of the numeric data so the source frame is left untouched
df_cleaned = df_numeric.copy()

# Compute the IQR bounds for 'imdbNumVotes' locally and apply them to that
# column only. Comparing every column against bounds derived from the vote
# counts would drop rows because of unrelated columns, and relying on the
# global lower_bound/upper_bound makes the result depend on which cell last
# assigned them.
votes = df_cleaned['imdbNumVotes']
votes_q1 = votes.quantile(0.25)
votes_q3 = votes.quantile(0.75)
votes_iqr = votes_q3 - votes_q1
votes_low = votes_q1 - 1.5 * votes_iqr
votes_high = votes_q3 + 1.5 * votes_iqr

# Remove outliers (rows with a missing vote count compare False and are kept)
df_cleaned = df_cleaned[~((votes < votes_low) | (votes > votes_high))]
print(f'Number of data points after removing outliers: {df_cleaned.shape[0]}')

### Performing Winsorization

In [None]:
from scipy.stats.mstats import winsorize

# Start from a copy of the numeric data so the source frame is left untouched
df_winsorized = df_numeric.copy()

# Winsorize each column: the lowest 5% and highest 5% of the observed values
# are replaced by the nearest remaining value.
# Only the non-missing entries are passed to winsorize(); with its default
# nan_policy ('propagate') NaNs "may be overwritten or propagated", which would
# either fill gaps silently or distort the upper cut-off.
for column in df_winsorized.columns:
    present = df_winsorized[column].notna()
    if not present.any():
        continue  # nothing to winsorize in an all-missing column
    capped = winsorize(df_winsorized.loc[present, column].to_numpy(), limits=[0.05, 0.05])
    df_winsorized.loc[present, column] = np.asarray(capped)

print(f'Winsorized data:\n{df_winsorized.describe()}')


### Log Transformation

In [None]:
# Start from a copy of the numeric data so the source frame is left untouched
df_log_transformed = df_numeric.copy()

# log(1 + x) on every column; the +1 shift keeps zeros finite (log1p(0) == 0).
# np.log1p is the dedicated ufunc for this and stays accurate for values close
# to zero, where computing x + 1 first loses precision.
# Values <= -1 are still outside the domain and come out as -inf / NaN.
df_log_transformed = np.log1p(df_log_transformed)

print(f'Log-transformed data:\n{df_log_transformed.describe()}')

### Comparing Methods

In [None]:
# Summary statistics for the original frame followed by each treated variant
# (outliers removed, winsorized, log-transformed). One print call with a
# newline separator emits every heading and table on its own line.
print(
    'Original Data Statistics:',
    df_numeric.describe(),
    '\nAfter Removing Outliers:',
    df_cleaned.describe(),
    '\nAfter Winsorization:',
    df_winsorized.describe(),
    '\nAfter Log Transformation:',
    df_log_transformed.describe(),
    sep='\n',
)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer

def train_model(df):
    """Fit a linear regression on a mean-imputed copy of ``df`` and report its test MSE.

    Missing values are replaced by the column mean, an example target
    (``imdbAverageRating * imdbNumVotes / 1000``) is derived from the imputed
    columns, and the rows are split 80/20 into train and test sets with a
    fixed ``random_state`` of 42.

    Args:
        df: DataFrame of numeric columns; must contain 'imdbAverageRating'
            and 'imdbNumVotes'.

    Returns:
        The mean squared error on the held-out test split. The value is also
        printed, so existing calls that ignore the return value behave as before.
    """
    # Mean-impute missing values; keep the caller's column names and row index
    imputer = SimpleImputer(strategy='mean')
    df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns, index=df.index)

    # Features are all imputed columns; the target is derived from two of them
    X = df_imputed
    y = (X['imdbAverageRating'] * X['imdbNumVotes']) / 1000  # Example target

    # Hold out 20% of the rows for evaluation
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train the model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Predict on the held-out rows
    y_pred = model.predict(X_test)

    # Evaluate, report, and hand the metric back so runs can be compared in code
    mse = mean_squared_error(y_test, y_pred)
    print(f'Mean Squared Error: {mse}')
    return mse

### Train Model with outliers

In [None]:
# Narrow the data to the two columns the model uses, then train on it
# with the outliers still present
df_numeric = df.loc[:, ['imdbAverageRating', 'imdbNumVotes']]
train_model(df_numeric)

### Training Machine Learning Model after outliers handling

In [None]:
# Per-column quartiles (25th and 75th percentile) and their spread
Q1, Q3 = (df_numeric.quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

# Outlier fences: 1.5 * IQR below Q1 and above Q3, one pair per column
lower_bound = Q1 - IQR * 1.5
upper_bound = Q3 + IQR * 1.5

# Keep only the rows where no column falls outside its fences, then retrain
df_no_outliers = df_numeric.loc[
    ~(df_numeric.lt(lower_bound) | df_numeric.gt(upper_bound)).any(axis=1)
]
train_model(df_no_outliers)

### Result Summarization

Training the model with and without outliers shows that dropping outliers helps prevent skewed data and helps with normalization.

**How do different outlier detection techniques impact the identification of anomalies in the datasets?**

Different outlier detection techniques have varying sensitivity, and more complex techniques cost more processing power. Choosing the right technique depends on the data being analyzed: the goal is to detect the anomalies that are actually present while avoiding false positives.

**How does the presence of outliers affect the performance of different machine learning models?**

Outliers are anomalies that are included in a set of data. These anomalies can cause the data to be skewed, which might affect machine learning models that are sensitive to extreme values. Outliers can cause the model to either overfit or underfit.

**How does the presence of outliers affect the performance of different machine learning models?**

Models, especially those that are sensitive to the distribution of the data, can be affected by large anomalies caused by outliers; these can increase a machine learning model's error rate and reduce its accuracy.

**What are the potential drawbacks of removing outliers from a dataset?**

Some of the drawbacks of removing outliers are the following:
- Loss of important data
- Overfitting

**If you were working with a real-world dataset, what factors would you consider before deciding to remove outliers?**

Some things to consider are the following:
- Importance of data distribution
- The target or goal of the machine learning model
- Impacts of removing or retaining outliers in the dataset

