In [11]:
import os
import re
import pandas as pd
import numpy as np
import duckdb
import unicodedata
from tqdm import tqdm
from textblob import TextBlob
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.preprocessing import MultiLabelBinarizer  # One-hot encoding
from scipy.stats import trim_mean
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import time
from imdb import IMDb

In [12]:
def preprocess_runtime(df, k=0.1):
    """
    Preprocesses the runtime information, including handling extreme values
    (movies with runtime > 1000 minutes, possibly in seconds or hours).

    Arguments:
    - df: DataFrame containing the movie data.
    - k: Proportion of values to trim from each end when computing the trimmed mean.

    Returns:
    - DataFrame with cleaned runtime information.
    """
    # First convert to numeric, coercing errors to NaN
    df["runtimeMinutes"] = pd.to_numeric(df["runtimeMinutes"], errors='coerce')

    # Compute k-trimmed mean for runtimeMinutes (ignoring NaN values)
    trimmed_mean_runtime = trim_mean(df["runtimeMinutes"].dropna().values, proportiontocut=k)
    print(trimmed_mean_runtime)

    # Fill missing values with trimmed mean
    df["runtimeMinutes"] = df["runtimeMinutes"].fillna(trimmed_mean_runtime)

    # Handle movies that are possibly in seconds or hours
    df["runtimeMinutes"] = df["runtimeMinutes"].apply(lambda x: x / 60 if pd.notna(x) and x > 1000 else x)  # Convert seconds to minutes
    df["runtimeMinutes"] = df["runtimeMinutes"].apply(lambda x: x * 60 if pd.notna(x) and x < 5 else x)  # Convert minutes to hours if under 5 mins

    # Now convert to int (after handling extreme values and filling NaNs)
    df["runtimeMinutes"] = df["runtimeMinutes"].round().astype(int)

    return df

In [13]:
# Initialize IMDbPY
ia = IMDb()

def fetch_movie_genre(title, genre_cache):
    """
    Fetch movie genre using IMDbPY.
    
    Arguments:
    - title: Cleaned movie title.
    - genre_cache: Dictionary to store already fetched genres to avoid duplicate API calls.
    
    Returns:
    - List of genres if found, otherwise ["Unknown"].
    """
    if title in genre_cache:  # Check cache first
        return genre_cache[title]

    try:
        movies = ia.search_movie(title)
        if movies:
            movie = movies[0]  # Get the first search result
            ia.update(movie)  # Fetch full details
            genre_list = movie.get('genres', ["Unknown"])
            genre_cache[title] = genre_list  # Store in cache as a list
            return genre_list
    except Exception as e:
        print(f"Error fetching genre for {title}: {e}")
    
    genre_cache[title] = ["Unknown"]
    return ["Unknown"]

In [14]:
def preprocess_imdb_data(data_path, directors_path, writers_path):
    """
    General preprocessing pipeline for IMDB data.
    
    Arguments:
    - data_path: Path to the train/test/validation data CSV file.
    - directors_path: Path to the directing.json file.
    - writers_path: Path to the writing.json file.
    
    Returns:
    - Cleaned Pandas DataFrame ready for model training or prediction.
    """
    
    # Step 1: Load main dataset
    df = pd.read_csv(data_path)

    # Step 2: Load JSON files (Directors & Writers)
    df_directors = pd.read_json(directors_path)
    df_writers = pd.read_json(writers_path)

    # Step 3: Rename columns for consistency
    df_directors.rename(columns={"movie": "tconst", "director": "director_id"}, inplace=True)
    df_writers.rename(columns={"movie": "tconst", "writer": "writer_id"}, inplace=True)

    # Step 4: Convert nested JSON fields into strings
    df_directors["director_id"] = df_directors["director_id"].astype(str)
    df_writers["writer_id"] = df_writers["writer_id"].astype(str)

    # Step 5: Merge main dataset with Directors & Writers using DuckDB
    con = duckdb.connect()
    con.register("movies", df)
    con.register("directors", df_directors)
    con.register("writers", df_writers)

    query = """
    SELECT 
        movies.*, 
        directors.director_id, 
        writers.writer_id
    FROM movies
    LEFT JOIN directors ON movies.tconst = directors.tconst
    LEFT JOIN writers ON movies.tconst = writers.tconst
    """

    df = con.execute(query).fetchdf()
    con.close()

    # Step 6: Create column year from startYear and endYear
    df['startYear'] = df['startYear'].replace('\\N', np.nan).astype(float)
    df['endYear'] = df['endYear'].replace('\\N', np.nan).astype(float)
    df['Year'] = df['startYear'].fillna(df['endYear'])

    # Step 7: Clean title names
    def normalize_text(text):
        if pd.isna(text):  # Handle missing values
            return ""
        text = str(text)
        text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('utf-8')  # Remove accents
        text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
        return text.strip()

    def clean_titles(row):
        primary = row['primaryTitle'] if pd.notna(row['primaryTitle']) else ''
        original = row['originalTitle'] if pd.notna(row['originalTitle']) else ''

        if not primary:
            primary = original

        cleaned_title = normalize_text(primary)

        return cleaned_title if cleaned_title else "Unknown Title"

    df['primaryTitle'] = df.apply(clean_titles, axis=1)
    df.rename(columns={'primaryTitle': 'movieTitle'}, inplace=True)

    # Step 8: Compute Title Uniqueness Score
    title_counts = df["movieTitle"].value_counts()
    df["title_uniqueness"] = df["movieTitle"].apply(lambda x: 1 / title_counts[x] if title_counts[x] > 1 else 1)

    # Step 9: Compute Sentiment Score
    df["sentiment_score"] = df["movieTitle"].astype(str).apply(lambda x: TextBlob(x).sentiment.polarity)

    # Step 10: Count words in each title
    df["word_count"] = df["movieTitle"].apply(lambda x: len(x.split()))

    # Step 11: Compute title length standard deviation
    df["title_word_length_std"] = df["movieTitle"].apply(lambda x: np.std([len(word) for word in x.split()]) if len(x.split()) > 1 else 0)
    
    # Step 12: Fetch Movie Genre using IMDbPY (Parallelized)
    genre_cache = {}  # Dictionary to cache fetched genres
    unique_titles = df["movieTitle"].unique()  # Get unique movie titles

    # Parallel processing to fetch genres faster
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        genres = list(executor.map(lambda title: fetch_movie_genre(title, genre_cache), unique_titles))

    # Map fetched genres back to the dataframe
    genre_map = dict(zip(unique_titles, genres))
    df["genre"] = df["movieTitle"].map(genre_map)

    # Step 13: One-Hot Encode Genres
    mlb = MultiLabelBinarizer()
    genre_encoded = mlb.fit_transform(df["genre"])  # Convert list of genres into binary matrix

    # Create DataFrame with one-hot encoded genre columns
    genre_df = pd.DataFrame(genre_encoded, columns=mlb.classes_, index=df.index)

    # Merge with main DataFrame
    df = pd.concat([df, genre_df], axis=1)

    # Step 14: Drop unnecessary columns
    columns_to_drop = ["originalTitle", "endYear", "startYear", "Unnamed: 0", "genre"]
    df.drop(columns=[col for col in columns_to_drop if col in df.columns], inplace=True)

    # Step 15: Handle missing values
    df = preprocess_runtime(df, 0.1)

    # Step 16: Fill missing values for numVotes
    trimmed_mean_votes = trim_mean(df["numVotes"].dropna(), proportiontocut=0.1)
    df["numVotes"] = df["numVotes"].fillna(trimmed_mean_votes)

    # Step 17: Fill missing values for director_id and writer_id
    df["director_id"] = df["director_id"].fillna("unknown")
    df["writer_id"] = df["writer_id"].fillna("unknown")

    # Step 18: Ensure correct data types
    df["Year"] = df["Year"].astype(int)
    df["numVotes"] = df["numVotes"].astype(int)

    # Step 19: Ensure each `tconst` is unique
    df = df.groupby("tconst").first().reset_index()
    
    return df

In [None]:
# Define file paths
# Define the base directory
base_data_dir = os.path.join(os.getcwd(), "imdb")

# Generate the list of train file paths
train_files = [os.path.join(base_data_dir, f) for f in os.listdir(base_data_dir) if f.startswith("train-") and f.endswith(".csv")]

# Define paths for directors and writers files
directors_path = os.path.join(base_data_dir, "directing.json")
writers_path = os.path.join(base_data_dir, "writing.json")

# Load JSON files (Directors & Writers)
df_directors = pd.read_json(directors_path)
df_writers = pd.read_json(writers_path)

# Preprocess and merge all training data
df_train = pd.concat([preprocess_imdb_data(file, directors_path, writers_path) for file in train_files], ignore_index=True)

# Preprocess validation and test data
df_val = preprocess_imdb_data(os.path.join(base_data_dir, "validation_hidden.csv"), directors_path, writers_path)
df_test = preprocess_imdb_data(os.path.join(base_data_dir, "test_hidden.csv"), directors_path, writers_path)

# Save cleaned datasets
df_train.to_csv("cleaned/final_training_data_genre.csv", index=False)
df_val.to_csv("cleaned/final_validation_data_genre.csv", index=False)
df_test.to_csv("cleaned/final_test_data_genre.csv", index=False)

print("\n✅ All datasets have been preprocessed and saved!")

103.14328808446456


2025-03-19 12:30:48,799 CRITICAL [imdbpy] C:\Users\Gebruiker\Documents\UVA\Vakken met code\BD\BigData-Group10\BD_env\Lib\site-packages\imdb\_exceptions.py:32: IMDbDataAccessError exception raised; args: ({'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/find/?q=1&s=tt', 'proxy': '', 'exception type': 'IOError', 'original exception': <HTTPError 500: 'Internal Server Error'>},); kwds: {}
Traceback (most recent call last):
  File "C:\Users\Gebruiker\Documents\UVA\Vakken met code\BD\BigData-Group10\BD_env\Lib\site-packages\imdb\parser\http\__init__.py", line 233, in retrieve_unicode
    response = uopener.open(url)
  File "C:\Users\Gebruiker\AppData\Local\Programs\Python\Python313\Lib\urllib\request.py", line 495, in open
    response = meth(req, response)
  File "C:\Users\Gebruiker\AppData\Local\Programs\Python\Python313\Lib\urllib\request.py", line 604, in http_response
    response = self.parent.error(
        'http', request, response, code, msg, hdrs)
  File "C:\Users\G

Error fetching genre for 1: {'errcode': None, 'errmsg': 'None', 'url': 'https://www.imdb.com/find/?q=1&s=tt', 'proxy': '', 'exception type': 'IOError', 'original exception': <HTTPError 500: 'Internal Server Error'>}
102.62376237623762
109.99336650082918


In [None]:
def calculate_unique_ratio(df, columns=None):
    """
    Calculate the ratio of unique rows to total rows in the DataFrame.

    Arguments:
    - df: DataFrame to analyze
    - columns: List of columns to consider (if None, uses all columns)

    Returns:
    - Dictionary containing unique ratio metrics
    """
    if columns is None:
        columns = df.columns

    total_rows = len(df)
    unique_rows = len(df[columns].drop_duplicates())
    ratio = unique_rows / total_rows

    metrics = {
        "total_rows": total_rows,
        "unique_rows": unique_rows,
        "unique_ratio": ratio
    }

    return metrics

# Add this after your data preprocessing
print("\n🔍 Analyzing unique row ratios...")

# Calculate ratios for all datasets
train_metrics = calculate_unique_ratio(df_train)
val_metrics = calculate_unique_ratio(df_val)
test_metrics = calculate_unique_ratio(df_test)

# Print results
print("\nUnique Row Analysis:")
print(f"Training Data:")
print(f"  - Total Rows: {train_metrics['total_rows']:,}")
print(f"  - Unique Rows: {train_metrics['unique_rows']:,}")
print(f"  - Unique Ratio: {train_metrics['unique_ratio']:.2%}")

print(f"\nValidation Data:")
print(f"  - Total Rows: {val_metrics['total_rows']:,}")
print(f"  - Unique Rows: {val_metrics['unique_rows']:,}")
print(f"  - Unique Ratio: {val_metrics['unique_ratio']:.2%}")

print(f"\nTest Data:")
print(f"  - Total Rows: {test_metrics['total_rows']:,}")
print(f"  - Unique Rows: {test_metrics['unique_rows']:,}")
print(f"  - Unique Ratio: {test_metrics['unique_ratio']:.2%}")

In [None]:
def handle_duplicates(df, groupby_cols=None, agg_strategy=None):
    """
    Handle duplicate rows using specified aggregation strategies.

    Arguments:
    - df: DataFrame to process
    - groupby_cols: List of columns to identify duplicates (default: all columns except label)
    - agg_strategy: Dictionary of column names and aggregation functions

    Returns:
    - DataFrame with handled duplicates
    """
    if groupby_cols is None:
        groupby_cols = [col for col in df.columns if col != 'label']

    if agg_strategy is None:
        agg_strategy = {
            'runtimeMinutes': 'mean',
            'numVotes': 'sum',
            'startYear': 'first',
            'director_id': 'first',
            'writer_id': 'first',
            'label': 'mode'
        }

    # Count occurrences before deduplication
    total_rows = len(df)
    unique_rows = len(df[groupby_cols].drop_duplicates())

    if total_rows == unique_rows:
        print("No duplicates found!")
        return df

    print(f"\nFound {total_rows - unique_rows:,} duplicate rows")
    print(f"Unique ratio before: {(unique_rows/total_rows):.2%}")

    # Handle duplicates using aggregation
    df_cleaned = df.groupby(groupby_cols, as_index=False).agg(agg_strategy)

    # Print results
    print(f"Rows after duplicate handling: {len(df_cleaned):,}")
    print(f"Unique ratio after: {(len(df_cleaned)/total_rows):.2%}")

    return df_cleaned

# Handle duplicates
print("\n🔍 Handling duplicates...")
df_train = handle_duplicates(df_train)
df_val = handle_duplicates(df_val)
df_test = handle_duplicates(df_test)

In [7]:
# # Define Features & Target
# features = ["Year", "runtimeMinutes", "numVotes", "director_id", "writer_id", "word_count"]
# X_train = df_train[features]
# y_train = df_train["label"]
# X_val = df_val[features]
# X_test = df_test[features]

# # Preprocessing Pipeline
# numeric_features = ["Year", "runtimeMinutes", "numVotes", "word_count"]
# categorical_features = ["director_id", "writer_id"]

# preprocessor = ColumnTransformer(
#     transformers=[
#         ("num", StandardScaler(), numeric_features),
#         ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
#     ]
# )

# # Train Logistic Regression Model
# model = Pipeline([
#     ("preprocessing", preprocessor),
#     ("classifier", SVC(kernel="linear", probability=True))
# ])

# print("🔹 Training model on full training data...")
# model.fit(X_train, y_train)

# # Generate Predictions
# y_val_pred = model.predict(X_val)
# y_test_pred = model.predict(X_test)

# # Save predictions in required format (no headers, single column)
# pd.DataFrame(y_val_pred).to_csv("submissions/validation_predictions_SVM.csv", index=False, header=False)
# pd.DataFrame(y_test_pred).to_csv("submissions/test_predictions_SVM.csv", index=False, header=False)

# print("✅ Predictions saved for submission!")

In [8]:
# Load preprocessed training dataset
df_train = pd.read_csv("cleaned/final_training_data_titlefeatures.csv")

# Apply frequency encoding to categorical variables
for col in ["director_id", "writer_id"]:
    freq_encoding = df_train[col].value_counts(normalize=True)
    df_train[col] = df_train[col].map(freq_encoding)

# Define Features & Target
features = ["Year", "runtimeMinutes", "numVotes", "director_id", "writer_id", "word_count", "title_uniqueness", "title_word_length_std", "sentiment_score"]
X = df_train[features]
y = df_train["label"]  # Only train data has labels

# **NEW: Split training data into train (80%) and validation (20%)**
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing Pipeline (same for all models)
numeric_features = ["Year", "runtimeMinutes", "numVotes", "word_count", "title_uniqueness", "title_word_length_std", "sentiment_score"]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
    ]
)
# Create pipeline with preprocessing
pipeline = Pipeline([
    ("preprocessing", preprocessor),
    ("classifier", SVC(kernel="linear", probability=True))
])

# Train model
pipeline.fit(X_train, y_train)

# Make predictions
y_train_pred = pipeline.predict(X_train)
y_val_pred = pipeline.predict(X_val)

# Evaluate model
train_accuracy = accuracy_score(y_train, y_train_pred)
val_accuracy = accuracy_score(y_val, y_val_pred)

print(f"✅ Training Accuracy: {train_accuracy:.4f}")
print(f"✅ Validation Accuracy: {val_accuracy:.4f}")
print(f"📊 Classification Report for:\n", classification_report(y_val, y_val_pred))


✅ Training Accuracy: 0.7140
✅ Validation Accuracy: 0.7142
📊 Classification Report for:
               precision    recall  f1-score   support

       False       0.68      0.80      0.74       799
        True       0.76      0.63      0.69       793

    accuracy                           0.71      1592
   macro avg       0.72      0.71      0.71      1592
weighted avg       0.72      0.71      0.71      1592



In [10]:
# Load preprocessed training dataset
df_train = pd.read_csv("cleaned/final_training_data_titlefeatures.csv")

# Apply frequency encoding to categorical variables
for col in ["director_id", "writer_id"]:
    freq_encoding = df_train[col].value_counts(normalize=True)
    df_train[col] = df_train[col].map(freq_encoding)

# Define Features & Target
features = ["Year", "runtimeMinutes", "numVotes", "director_id", "writer_id", "word_count",
           "title_uniqueness", "title_word_length_std", "sentiment_score"]
X = df_train[features]
y = df_train["label"]  # Only train data has labels

# Split training data into train (80%) and validation (20%)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing Pipeline
numeric_features = ["Year", "runtimeMinutes", "numVotes", "word_count",
                   "title_uniqueness", "title_word_length_std", "sentiment_score"]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
    ]
)

# Create base pipeline with preprocessing
pipeline = Pipeline([
    ("preprocessing", preprocessor),
    ("classifier", None)  # Placeholder for classifier
])


classifiers = {
    "LogisticRegression": {
        "classifier": [LogisticRegression(max_iter=1000, random_state=42)],
        "classifier__C": [0.1, 1.0, 10.0],
        "classifier__solver": ["liblinear", "lbfgs"]
    },
    "SVC": {
        "classifier": [SVC(probability=True, random_state=42)],
        "classifier__C": [0.1, 1.0, 10.0],
        "classifier__kernel": ["linear", "rbf"]
    },
    "RandomForest": {
        "classifier": [RandomForestClassifier(random_state=42)],
        "classifier__n_estimators": [100, 200],
        "classifier__max_depth": [None, 20]
    },
    "GradientBoosting": {
        "classifier": [GradientBoostingClassifier(random_state=42)],
        "classifier__n_estimators": [100, 200],
        "classifier__learning_rate": [0.01, 0.1]
    },
    "AdaBoost": {
        "classifier": [AdaBoostClassifier(random_state=42)],
        "classifier__n_estimators": [50, 100, 200],
        "classifier__learning_rate": [0.01, 0.1, 1.0]
    }
}

# Perform GridSearch for each classifier
results = {}
print("🔍 Starting grid search across classifiers...")

for name, param_grid in classifiers.items():
    print(f"\nEvaluating {name}...")
    start_time = time.time()

    grid = GridSearchCV(
        pipeline,
        param_grid=param_grid,
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        scoring="accuracy",
        n_jobs=-1
    )

    # Train model
    grid.fit(X_train, y_train)

    # Get best model
    best_model = grid.best_estimator_

    # Evaluate on validation set
    y_val_pred = best_model.predict(X_val)
    val_accuracy = accuracy_score(y_val, y_val_pred)

    # Store results
    results[name] = {
        "best_model": best_model,
        "best_params": grid.best_params_,
        "best_cv_score": grid.best_score_,
        "val_accuracy": val_accuracy,
        "time": time.time() - start_time
    }

    print(f"✅ Best parameters: {grid.best_params_}")
    print(f"✅ Cross-validation accuracy: {grid.best_score_:.4f}")
    print(f"✅ Validation accuracy: {val_accuracy:.4f}")
    print(f"⏱️ Time: {results[name]['time']:.2f} seconds")

# Print summary table sorted by validation accuracy
print("\n📊 Summary of Results (sorted by validation accuracy):")
print(f"{'Classifier':<20} {'Val Accuracy':<15} {'CV Accuracy':<15} {'Time (s)':<10}")
print("-" * 60)

for name, result in sorted(results.items(), key=lambda x: x[1]["val_accuracy"], reverse=True):
    print(f"{name:<20} {result['val_accuracy']:.4f}{' '*10} {result['best_cv_score']:.4f}{' '*10} {result['time']:.2f}")

# Get best model
best_classifier = max(results.items(), key=lambda x: x[1]["val_accuracy"])
best_name = best_classifier[0]
best_result = best_classifier[1]

print(f"\n🏆 Best classifier: {best_name}")
print(f"✅ Validation accuracy: {best_result['val_accuracy']:.4f}")

# Detailed evaluation of best model
y_val_pred = best_result["best_model"].predict(X_val)
print(f"\n📊 Classification Report for {best_name}:\n")
print(classification_report(y_val, y_val_pred))

🔍 Starting grid search across classifiers...

Evaluating LogisticRegression...
✅ Best parameters: {'classifier': LogisticRegression(max_iter=1000, random_state=42), 'classifier__C': 10.0, 'classifier__solver': 'liblinear'}
✅ Cross-validation accuracy: 0.7071
✅ Validation accuracy: 0.7205
⏱️ Time: 1.39 seconds

Evaluating SVC...
✅ Best parameters: {'classifier': SVC(probability=True, random_state=42), 'classifier__C': 10.0, 'classifier__kernel': 'rbf'}
✅ Cross-validation accuracy: 0.7220
✅ Validation accuracy: 0.7205
⏱️ Time: 16.44 seconds

Evaluating RandomForest...
✅ Best parameters: {'classifier': RandomForestClassifier(random_state=42), 'classifier__max_depth': 20, 'classifier__n_estimators': 100}
✅ Cross-validation accuracy: 0.7182
✅ Validation accuracy: 0.7255
⏱️ Time: 3.61 seconds

Evaluating GradientBoosting...
✅ Best parameters: {'classifier': GradientBoostingClassifier(random_state=42), 'classifier__learning_rate': 0.1, 'classifier__n_estimators': 100}
✅ Cross-validation accur

In [10]:
from sklearn.ensemble import VotingClassifier, StackingClassifier

# Create base models from your best performers
gb = results["GradientBoosting"]["best_model"]
rf = results["RandomForest"]["best_model"]
svm = results["SVC"]["best_model"]

# Create voting ensemble
voting_clf = VotingClassifier(
    estimators=[('gb', gb), ('rf', rf), ('svm', svm)],
    voting='soft'  # Use probability estimates
)
voting_clf.fit(X_train, y_train)
vote_acc = accuracy_score(y_val, voting_clf.predict(X_val))
print(f"Voting Classifier Accuracy: {vote_acc:.4f}")

# Create stacking ensemble
stacking_clf = StackingClassifier(
    estimators=[('gb', gb), ('rf', rf), ('svm', svm)],
    final_estimator=LogisticRegression()
)
stacking_clf.fit(X_train, y_train)
stack_acc = accuracy_score(y_val, stacking_clf.predict(X_val))
print(f"Stacking Classifier Accuracy: {stack_acc:.4f}")

Voting Classifier Accuracy: 0.7450
Stacking Classifier Accuracy: 0.7443
