In [9]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

def load_data(dataset_path):
    """Read a CSV file into a DataFrame.

    Args:
        dataset_path: Anything ``pd.read_csv`` accepts as its first argument
            (a path string, a path object, or an open file-like object).

    Returns:
        The parsed ``pandas.DataFrame``; all ``read_csv`` defaults apply.
    """
    frame = pd.read_csv(dataset_path)
    return frame

def clean_data(df):
    """Return ``df`` without any row that contains a missing value.

    The input frame is not modified; ``dropna`` builds a new one. Imputing
    with ``df.fillna()`` would be the alternative when rows must be kept.

    Args:
        df: DataFrame to clean.

    Returns:
        A new DataFrame holding only the fully populated rows.
    """
    return df.dropna(axis=0, how="any")

def encode_categorical(df, columns):
    """One-hot encode the given categorical columns.

    Args:
        df: Source DataFrame (left untouched).
        columns: Names of the columns to replace by indicator columns.

    Returns:
        A new DataFrame in which each listed column is replaced by its dummy
        columns; the first level of every column is dropped
        (``drop_first=True``).
    """
    return pd.get_dummies(data=df, columns=columns, drop_first=True)

def scale_features(df, features):
    """Standardize the selected numeric columns with ``StandardScaler``.

    The scaling is applied to a copy, so the caller's DataFrame is left
    unchanged and re-running the calling cell does not scale already-scaled
    data a second time.

    Args:
        df: Source DataFrame.
        features: List of column names to standardize.

    Returns:
        A new DataFrame equal to ``df`` except that the ``features`` columns
        hold the scaler's output. With an empty ``features`` list an unscaled
        copy is returned.
    """
    scaled = df.copy()
    # Nothing to scale: StandardScaler would reject a zero-column input.
    if len(features) == 0:
        return scaled
    scaler = StandardScaler()
    scaled[features] = scaler.fit_transform(scaled[features])
    return scaled

def split_data(df, target, test_size=0.2, random_state=42):
    """Split a DataFrame into train/test feature and target sets.

    Args:
        df: DataFrame containing both the features and the target column.
        target: Name of the target column; it is removed from the features.
        test_size: Passed to ``train_test_split`` as ``test_size``.
        random_state: Seed passed to ``train_test_split``. Defaults to 42 so
            existing calls keep producing the same split.

    Returns:
        The value returned by ``train_test_split(X, y, ...)``, which callers
        unpack as ``X_train, X_test, y_train, y_test``.
    """
    X = df.drop(columns=target)
    y = df[target]
    return train_test_split(X, y, test_size=test_size, random_state=random_state)

In [10]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def plot_correlation_matrix(df):
    """Plot a heatmap of the pairwise correlations between numeric columns.

    Non-numeric columns (strings, categories, dates) are left out before the
    correlation is computed. On pandas >= 2.0 ``DataFrame.corr`` no longer
    drops them by default and raises when it meets a non-numeric column.

    Args:
        df: DataFrame whose numeric and boolean columns are correlated.

    Raises:
        ValueError: If ``df`` has no numeric or boolean column.
    """
    numeric_df = df.select_dtypes(include=["number", "bool"])
    # Fail before a figure is created so no empty plot is left behind.
    if numeric_df.shape[1] == 0:
        raise ValueError(
            "plot_correlation_matrix: DataFrame has no numeric columns to correlate"
        )
    corr_matrix = numeric_df.corr()
    plt.figure(figsize=(10, 8))
    sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt='.2f')
    plt.title("Correlation Matrix")
    plt.show()

def plot_feature_distribution(df, feature):
    """Draw a histogram with a KDE overlay for one column.

    Args:
        df: DataFrame holding the data.
        feature: Name of the column to plot; also used in the figure title.
    """
    plt.figure(figsize=(8, 6))
    column_values = df[feature]
    sns.histplot(data=column_values, kde=True)
    plt.title(f'Distribution of {feature}')
    plt.show()

def plot_pairplot(df):
    """Show seaborn's pairwise-relationship grid for ``df``.

    Args:
        df: DataFrame handed to ``sns.pairplot`` unchanged.
    """
    sns.pairplot(data=df)
    plt.show()

In [11]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

def _fit_and_score(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and return its test-set accuracy."""
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

def train_logistic_regression(X_train, y_train, X_test, y_test):
    """Train a Logistic Regression model and return its test accuracy.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels used for scoring.

    Returns:
        The ``accuracy_score`` of the fitted model on the test split.
    """
    return _fit_and_score(
        LogisticRegression(max_iter=200), X_train, y_train, X_test, y_test
    )

def train_decision_tree(X_train, y_train, X_test, y_test):
    """Train a Decision Tree classifier and return its test accuracy.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels used for scoring.

    Returns:
        The ``accuracy_score`` of the fitted model on the test split.
    """
    return _fit_and_score(
        DecisionTreeClassifier(random_state=42), X_train, y_train, X_test, y_test
    )

def train_random_forest(X_train, y_train, X_test, y_test):
    """Train a Random Forest classifier and return its test accuracy.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels used for scoring.

    Returns:
        The ``accuracy_score`` of the fitted model on the test split.
    """
    return _fit_and_score(
        RandomForestClassifier(random_state=42), X_train, y_train, X_test, y_test
    )

In [12]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans

def kmeans_clustering(df, num_clusters=3):
    """Cluster the rows of ``df`` with KMeans and attach the labels.

    The labels are written to a copy, so the caller's DataFrame is left
    unchanged. A ``'Cluster'`` column already present in ``df`` (for example
    from an earlier run of the same cell) is not used as a clustering
    feature; it is overwritten in the result.

    Args:
        df: DataFrame whose columns (other than ``'Cluster'``) are passed to
            ``KMeans.fit_predict``.
        num_clusters: Number of clusters (``n_clusters``).

    Returns:
        A new DataFrame equal to ``df`` plus a ``'Cluster'`` column holding
        the label assigned to each row.
    """
    # Drop stale labels so that re-running does not cluster on its own output.
    feature_frame = df.drop(columns='Cluster', errors='ignore')
    model = KMeans(n_clusters=num_clusters, random_state=42)
    clustered = df.copy()
    clustered['Cluster'] = model.fit_predict(feature_frame)
    return clustered

def plot_clusters(df, x_col, y_col):
    """Scatter-plot two columns of ``df`` coloured by the ``'Cluster'`` column.

    Args:
        df: DataFrame containing ``x_col``, ``y_col`` and a ``'Cluster'``
            column.
        x_col: Column plotted on the x axis.
        y_col: Column plotted on the y axis.

    Raises:
        KeyError: If any of the three required columns is missing. The check
            runs before the figure is created so no empty plot is left behind.
    """
    missing = [col for col in (x_col, y_col, 'Cluster') if col not in df.columns]
    if missing:
        raise KeyError(f"plot_clusters: column(s) not found in DataFrame: {missing}")
    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=df[x_col], y=df[y_col], hue=df['Cluster'], palette='Set2')
    plt.title(f'KMeans Clustering ({x_col} vs {y_col})')
    plt.show()

In [13]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

def load_transactions_data(file_path):
    """Read a header-less transactions CSV into a DataFrame.

    Args:
        file_path: Anything ``pd.read_csv`` accepts as its first argument.

    Returns:
        A DataFrame whose columns are numbered ``0..n-1`` because the file is
        read with ``header=None`` (the first line is treated as data).
    """
    transactions = pd.read_csv(file_path, header=None)
    return transactions

def encode_transactions(df):
    """Convert a transactions table into a one-hot (item-presence) matrix.

    Each row of ``df`` is one transaction and each cell holds an item name or
    a missing value. The result has one row per transaction and one column
    per distinct item, with 1 where the item occurs in the transaction and 0
    otherwise.

    Args:
        df: DataFrame of item-name strings, padded with missing values.

    Returns:
        DataFrame indexed like the non-empty rows of ``df`` with one 0/1
        column per item. An item repeated within a transaction still yields 1.
    """
    # Long format: one entry per (transaction, position); padding removed.
    items = df.stack().dropna()
    # NOTE: ``str.get_dummies`` splits on "|" by default, so an item name
    # containing "|" is treated as several items.
    indicators = items.str.get_dummies()
    # ``.sum(level=0)`` was removed in pandas 2.0; group by the transaction
    # level instead. ``max`` (not ``sum``) keeps the matrix strictly 0/1, and
    # ``sort=False`` keeps the original transaction order.
    df_encoded = indicators.groupby(level=0, sort=False).max()
    return df_encoded

def mine_association_rules(df_encoded, min_support=0.1, min_threshold=0.5, metric="lift"):
    """Mine association rules from a one-hot transactions matrix with Apriori.

    Args:
        df_encoded: One-hot encoded transactions (one column per item).
        min_support: Minimum support passed to ``apriori``.
        min_threshold: Minimum value of ``metric`` a rule must reach; passed
            to ``association_rules``.
        metric: Name of the rule metric that ``min_threshold`` applies to,
            passed to ``association_rules``. Defaults to ``"lift"`` so
            existing calls behave as before.

    Returns:
        The rules DataFrame returned by ``association_rules``.
    """
    frequent_itemsets = apriori(df_encoded, min_support=min_support, use_colnames=True)
    rules = association_rules(frequent_itemsets, metric=metric, min_threshold=min_threshold)
    return rules