In [None]:
#submitting programs    

In [1]:
"""
Nominal Head Feature Extractor from CoNLL-U Files (preprocessing script for the Logistic Regression model)
Author: [Raha Musavi]
Date: [2025-05-02]

This script processes annotated linguistic data in the CoNLL-U format to extract
features relevant to the study of nominal heads (nouns, pronouns, proper nouns)
and their direct dependents.


Workflow:
1.  Reads `.conllu` files from the specified input directory (`INPUT_FOLDER_PATH`).
2.  Iterates through each `.conllu` file.
3.  Parses each sentence within the files using the `conllu` library.
4.  For each token (word), it performs:
    a. ID Conversion: Handles standard integer IDs and sub-token IDs (e.g., '2.1').
    b. Deprel Standardization: Corrects known typos or variations in dependency
       relation labels using a predefined mapping (e.g., 'nm' -> 'nmod').
    c. Filtering: Ignores tokens with dependency relations deemed irrelevant
       for this analysis (e.g., 'punct', 'dep').
5.  Identifies tokens acting as nominal heads (NOUN, PROPN, PRON).
6.  For each nominal head, it finds its direct dependents within the same sentence.
7.  Extracts features characterizing the head-dependent relationship (see code).
8.  Aggregates these features from all files into a pandas DataFrame.
9.  Cleans the data by removing exact duplicate feature rows.
10. Optionally (controlled by `ADJUST_ZEROS_FLAG`) adjusts numerical columns
    containing zeros by adding 1 to all values in that column.
11. Saves the final feature set to the specified output CSV file (`OUTPUT_CSV_PATH`).

Dependencies:
- Python 3.6+
- pandas library (`pip install pandas`)
- python-conllu library (`pip install conllu`)
"""

import os
import pandas as pd
from conllu import parse_incr
from typing import List, Dict, Any, Tuple, Union, Optional, Set

# --- Configuration Constants ---

# Maps observed dependency-label typos/variants to their standard form.
# Labels that are not listed here are left unchanged by standardize_deprel().
STANDARDIZED_DEPRELS: Dict[str, str] = {
    "nm": "nmod",
    "adjmod": "amod",
    "al:relcl": "acl:relcl",
    "acl:recl": "acl:relcl",
}

# Dependency relations whose tokens are excluded from the analysis entirely.
# The check is made on the standardized label (see process_conllu_file).
STOPWORDS_DEPRELS: Set[str] = {'punct', 'dep', 'reparandum'}

# Lemmas treated as potential ezafe markers when computing 'ezafe_label'
POTENTIAL_EZAFE_MARKERS: Set[str] = {'ī'} 

# Universal POS tags identifying nominal categories to be treated as heads
NOMINAL_UPOS_TAGS: Set[str] = {'NOUN', 'PROPN', 'PRON'}

# Universal POS tags identifying verbal categories (used for the 'is_verbal' feature)
VERBAL_UPOS_TAGS: Set[str] = {'VERB', 'AUX'}

# --- Helper Functions ---

def standardize_deprel(deprel: Optional[str]) -> Optional[str]:
    """Return the canonical spelling of a dependency label.

    A label listed in STANDARDIZED_DEPRELS is replaced by its mapped value;
    every other label, and None, is handed back unchanged.
    """
    if deprel is not None and deprel in STANDARDIZED_DEPRELS:
        return STANDARDIZED_DEPRELS[deprel]
    return deprel

def convert_token_id_to_float(token_id: Union[int, Tuple[int, str, int], str]) -> Optional[float]:
    """
    Convert a CoNLL-U token ID to a float.

    Accepted inputs:
      - int / float              -> float(token_id)
      - (major, '.', minor)      -> float("major.minor"), e.g. (2, '.', 1) -> 2.1
      - numeric str              -> float(token_id), e.g. '3' -> 3.0, '2.1' -> 2.1

    Returns None for multiword-token ranges, whether given as the string
    '1-2' or as the tuple (1, '-', 2), and for any other value that cannot
    be converted. A warning is printed for malformed sub-token tuples and
    for unexpected input types.

    NOTE: the float encoding is lossy for minor parts with trailing zeros:
    (2, '.', 10) and (2, '.', 1) both become 2.1.
    """
    if isinstance(token_id, tuple) and len(token_id) == 3:
        major, separator, minor = token_id
        if separator != '.':
            # A 3-tuple with any other separator, e.g. (1, '-', 2), is a range
            # (multiword token), not a node. Formatting it as "1.2" would
            # invent a sub-token that does not exist in the sentence.
            return None
        try:
            return float(f"{int(major)}.{int(minor)}")
        except (ValueError, TypeError):
            print(f"Warning: Could not convert complex ID tuple {token_id} to float.")
            return None

    if isinstance(token_id, (int, float)):
        return float(token_id)

    if isinstance(token_id, str):
        # Digits with at most one decimal point; anything else (e.g. '1-2') is ignored.
        if token_id.replace('.', '', 1).isdigit():
            try:
                return float(token_id)
            except ValueError:
                return None
        return None

    print(f"Warning: Unexpected token ID type {type(token_id)} ({token_id}).")
    return None

# --- Data Structure Class ---

class Token:
    """Plain record holding the CoNLL-U fields this script reads for one token."""

    def __init__(self, id_: float, form: Optional[str], lemma: Optional[str],
                 upos: Optional[str], head: Optional[int], deprel: Optional[str]):
        # Token ID as a float, so that sub-token IDs such as 2.1 can be stored.
        self.id = id_
        # Surface form and lemma.
        self.form, self.lemma = form, lemma
        # Universal POS tag.
        self.upos = upos
        # ID of the governing token, and the (assumed standardized) relation label.
        self.head, self.deprel = head, deprel

# --- Core Logic Functions ---

def extract_np_features(sentence_tokens: List[Token], source_file: str) -> List[Dict[str, Any]]:
    """Extract one feature row per (nominal head, direct dependent) pair in a sentence.

    Args:
        sentence_tokens: The tokens of a single sentence, in sentence order.
        source_file: Name stored in each row's 'source_file' field.

    Returns:
        A list of feature dicts. For every token whose UPOS is in
        NOMINAL_UPOS_TAGS, one dict is produced for each token whose `head`
        equals that token's ID. Rows are emitted in sentence order of the
        heads, then sentence order of their dependents.
    """
    # Index the sentence once (head ID -> attached tokens, in sentence order)
    # instead of rescanning every token for each head and each dependent.
    children_by_head: Dict[int, List[Token]] = {}
    for tok in sentence_tokens:
        if tok.head is not None:
            children_by_head.setdefault(tok.head, []).append(tok)

    def children_of(token: Token) -> List[Token]:
        # `head` values are integers, so only a token with an integer-valued ID
        # can govern anything. Truncating a sub-token ID (2.1 -> 2) would
        # wrongly credit node 2.1 with the dependents of token 2.
        if float(token.id).is_integer():
            return children_by_head.get(int(token.id), [])
        return []

    nominal_features: List[Dict[str, Any]] = []

    for head_token in sentence_tokens:
        if head_token.upos not in NOMINAL_UPOS_TAGS:
            continue

        dependents = children_of(head_token)
        num_dependents_of_head = len(dependents)

        for dep_token in dependents:
            try:
                distance = abs(dep_token.id - head_token.id)
                position = 'before' if dep_token.id < head_token.id else 'after'
                dependents_of_dependent = children_of(dep_token)
                # The pair is labelled as ezafe when any token attached to the
                # dependent has a lemma listed in POTENTIAL_EZAFE_MARKERS.
                has_ezafe = any(
                    t.lemma in POTENTIAL_EZAFE_MARKERS for t in dependents_of_dependent
                )
                is_verbal = int(dep_token.upos in VERBAL_UPOS_TAGS)
            except (TypeError, ValueError, AttributeError) as e:
                # Report the specific problematic pair but keep processing the rest.
                print(f"  Error extracting features for pair head={head_token.id}, dep={dep_token.id} in {source_file}: {e}")
                continue

            nominal_features.append({
                'nominal_head_id': head_token.id,
                'nominal_head_lemma': head_token.lemma,
                'nominal_head_upos': head_token.upos,
                'dependent_id': dep_token.id,
                'dependent_lemma': dep_token.lemma,
                'dependent_upos': dep_token.upos,
                'dependent_deprel': dep_token.deprel,
                'distance': distance,
                'position': position,
                'num_dependents_nominal': num_dependents_of_head,
                'num_dependents_dependent': len(dependents_of_dependent),
                'ezafe_label': int(has_ezafe),
                'is_verbal': is_verbal,
                'source_file': source_file,
            })

    return nominal_features

def process_conllu_file(file_path: str) -> List[Dict[str, Any]]:
    """Parse one CoNLL-U file and return the nominal feature rows of all its sentences.

    A token is skipped when it lacks a required field, when its ID cannot be
    converted, when its standardized dependency relation is in
    STOPWORDS_DEPRELS, or when its HEAD value is not convertible to int.
    If the file is missing, or any other error escapes the parsing loop, an
    empty list is returned and rows collected so far are discarded.
    """
    required_fields = ['id', 'form', 'upos', 'head', 'deprel']
    file_name = os.path.basename(file_path)
    rows: List[Dict[str, Any]] = []
    print(f"Processing: {file_name}...")

    try:
        with open(file_path, 'r', encoding='utf-8') as infile:
            for sentence_idx, token_list in enumerate(parse_incr(infile)):
                # Fall back to a synthetic ID when the sentence has no 'sent_id' metadata.
                sentence_id = token_list.metadata.get('sent_id', f'file_{file_name}_sent_{sentence_idx+1}')
                kept_tokens: List[Token] = []

                for token_data in token_list:
                    if any(field not in token_data for field in required_fields):
                        print(f"  Skipping malformed token data in {sentence_id}: {token_data}")
                        continue

                    token_id = convert_token_id_to_float(token_data['id'])
                    if token_id is None:
                        continue

                    deprel_std = standardize_deprel(token_data['deprel'])
                    if deprel_std in STOPWORDS_DEPRELS:
                        continue

                    raw_head = token_data['head']
                    if raw_head is None:
                        head_id: Optional[int] = None
                    else:
                        try:
                            head_id = int(raw_head)
                        except (ValueError, TypeError):
                            # A token whose head cannot be interpreted is dropped.
                            print(f"  Warning: Invalid Head ID '{raw_head}' for token {token_id} in {sentence_id}. Skipping token.")
                            continue

                    kept_tokens.append(Token(
                        id_=token_id,
                        form=token_data.get('form'),
                        lemma=token_data.get('lemma'),
                        upos=token_data.get('upos'),
                        head=head_id,
                        deprel=deprel_std,
                    ))

                if not kept_tokens:
                    continue

                try:
                    rows.extend(extract_np_features(kept_tokens, file_name))
                except Exception as e:
                    # A failure in one sentence must not abort the rest of the file.
                    print(f"  Error processing sentence {sentence_id} features in {file_name}: {e}")

    except FileNotFoundError:
        print(f"Error: File not found - {file_path}")
        return []
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
        return []

    print(f"  Finished {file_name}, found {len(rows)} nominal pairs.")
    return rows

def adjust_df_zeros(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with every zero-containing numeric column shifted by +1.

    Only int64/float64 columns are inspected. A column is shifted as a whole
    (all of its values, not just the zeros) when at least one value equals 0.
    The input frame is not modified.
    """
    shifted = df.copy()
    touched = []
    for name in shifted.select_dtypes(include=['int64', 'float64']).columns:
        column = shifted[name]
        if not (column == 0).any():
            continue
        print(f"  Adjusting column '{name}' (+1) due to presence of zeros.")
        shifted[name] = column + 1
        touched.append(name)

    if touched:
        print(f"  Zero adjustment applied to {len(touched)} column(s).")
    else:
        print("  No zero adjustment needed for numerical columns.")
    return shifted

# --------------------------------------------------------------------------
# --- Configuration ---
# --------------------------------------------------------------------------

# Folder whose direct entries are scanned for `.conllu` input files
INPUT_FOLDER_PATH = r'C:\Users\rahaa\Dropbox\MPCD\conllus_with_erros'

# Specify the desired path for the output CSV file
OUTPUT_CSV_PATH = r'C:\Users\rahaa\Dropbox\MPCD\LR-input.csv'

# Set to True to add 1 to numerical columns with zeros, False to disable
ADJUST_ZEROS_FLAG = True

# --------------------------------------------------------------------------
# --- Execution ---
# --------------------------------------------------------------------------

# NOTE: this section runs at top level (no `__main__` guard), so it executes
# whenever the cell/module is run and leaves its variables as globals.
print("--- Starting Nominal Feature Extraction ---")

# --- Input Validation ---
if not os.path.isdir(INPUT_FOLDER_PATH):
    print(f"Error: Input folder not found: {INPUT_FOLDER_PATH}")
else:
    # Feature rows accumulated across all input files
    all_extracted_data = []
    print(f"Reading CoNLL-U files from: {INPUT_FOLDER_PATH}")

    # --- File Processing ---
    # Case-insensitive extension match; sorted so files are processed in a stable order
    conllu_files = sorted([f for f in os.listdir(INPUT_FOLDER_PATH) if f.lower().endswith('.conllu')])

    if not conllu_files:
        print("Warning: No .conllu files found in the input folder.")
    else:
        print(f"Found {len(conllu_files)} CoNLL-U files to process.")
        for filename in conllu_files:
            file_path = os.path.join(INPUT_FOLDER_PATH, filename)
            # Skip directory entries that merely end in '.conllu'
            if os.path.isfile(file_path):
                file_features = process_conllu_file(file_path)
                all_extracted_data.extend(file_features)

        if not all_extracted_data:
            print("Processing complete, but no features were extracted.")
        else:
            # --- Data Aggregation and Cleaning ---
            print("\nConsolidating features...")
            features_df = pd.DataFrame(all_extracted_data)
            print(f"Total features extracted (raw): {len(features_df)}")

            # Remove duplicates (rows identical in every column, 'source_file' included)
            initial_rows = len(features_df)
            features_df.drop_duplicates(inplace=True)
            print(f"Features after removing duplicates: {len(features_df)} ({initial_rows - len(features_df)} removed)")

            # Adjust zeros if requested
            # NOTE(review): adjust_df_zeros shifts every int64/float64 column that
            # contains a zero, which would include 0/1 columns such as
            # 'ezafe_label' and 'is_verbal' - confirm downstream code expects that.
            if ADJUST_ZEROS_FLAG:
                print("Performing zero adjustment on numerical columns...")
                features_df = adjust_df_zeros(features_df)
            else:
                print("Skipping zero adjustment.")

            # --- Output ---
            print(f"\nSaving final features ({len(features_df)} rows) to: {OUTPUT_CSV_PATH}")
            try:
                # Ensure output directory exists
                output_dir = os.path.dirname(OUTPUT_CSV_PATH)
                if output_dir and not os.path.exists(output_dir):
                    print(f"Creating output directory: {output_dir}")
                    os.makedirs(output_dir)

                features_df.to_csv(OUTPUT_CSV_PATH, index=False, encoding='utf-8')
                print("--- Processing complete. Features saved successfully. ---")
            except IOError as e:
                # Write failures are reported, not raised; the script still reaches its final message
                print(f"Error: Failed to write output file '{OUTPUT_CSV_PATH}'. Check path and permissions.")
                print(f"Details: {e}")
            except Exception as e:
                print(f"An unexpected error occurred during file saving: {e}")

print("--- Script execution finished. ---")


--- Starting Nominal Feature Extraction ---
Reading CoNLL-U files from: C:\Users\rahaa\Dropbox\MPCD\conllus_with_erros
Found 12 CoNLL-U files to process.
Processing: Col_RFS_TD2.conllu...
  Finished Col_RFS_TD2.conllu, found 32 nominal pairs.
Processing: DD-K35.conllu...
  Finished DD-K35.conllu, found 1991 nominal pairs.
Processing: DMX-L19.conllu...
  Finished DMX-L19.conllu, found 1447 nominal pairs.
Processing: Dk5_B.conllu...
  Finished Dk5_B.conllu, found 4528 nominal pairs.
Processing: Dk7-B.conllu...
  Finished Dk7-B.conllu, found 1258 nominal pairs.
Processing: GBd_TD1.conllu...
  Finished GBd_TD1.conllu, found 8202 nominal pairs.
Processing: NM_K35.conllu...
  Finished NM_K35.conllu, found 2476 nominal pairs.
Processing: NM_TD4a.conllu...
  Finished NM_TD4a.conllu, found 378 nominal pairs.
Processing: RAF-TD2.conllu...
  Finished RAF-TD2.conllu, found 657 nominal pairs.
Processing: RFS-TD2.conllu...
  Finished RFS-TD2.conllu, found 144 nominal pairs.
Processing: ZWY-K20.conll

In [2]:
"""
Feature Engineering for Logistic Regression Model on Middle Persian ezafe Data

Author: [Raha Musavi]
Date: [2025-05-02]

This script takes the preprocessed nominal features (output from the first script)
and performs feature engineering steps specifically tailored for the Logistic
Regression model, as described in Chapter 5, Section 5.4 of the thesis.

Workflow:
1.  Loads the base feature CSV file.
2.  Drops columns identified as trivial or redundant for this model
    (IDs, lemmata, head UPOS).
3.  Encodes the 'position' column numerically.
4.  Applies One-Hot Encoding to categorical features ('dependent_upos',
    'dependent_deprel').
5.  Generates interaction features between encoded 'dependent_upos' and
    numerical 'position'.
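6.  Selects the numerical features to scale: the base numeric columns
    (distance, dependent counts, is_verbal) plus the encoded position.
    One-hot and interaction columns are not scaled.
7.  Applies RobustScaler to those selected numerical features.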
8.  Saves the final, engineered feature set to a new CSV file.

Dependencies:
- Python 3.6+
- pandas library (`pip install pandas`)
- scikit-learn library (`pip install scikit-learn`)
"""

import os
import pandas as pd
from sklearn.preprocessing import RobustScaler
import warnings

# Suppress warnings for cleaner output if desired
# warnings.filterwarnings("ignore")

# --- Configuration ---

# Input file path (output from the first preprocessing script)
INPUT_BASE_CSV_PATH = r'C:\Users\rahaa\Dropbox\MPCD\LR-input.csv' # <-- ADJUST if needed

# Output file path for the engineered features
OUTPUT_ENGINEERED_CSV_PATH = r'C:\Users\rahaa\Dropbox\MPCD\LR-input-engineered.csv' # <-- ADJUST if needed

# Columns to drop based on thesis Section 5.4.1
# (dropped with errors='ignore', so names missing from the CSV are skipped silently)
COLUMNS_TO_DROP = [
    "nominal_head_id",
    "dependent_id",
    "nominal_head_lemma",
    "nominal_head_upos", # Dropped as per 5.4.1
    "dependent_lemma",
    # 'source_file' # Keep source_file for now, might be used later or dropped in model training script
]

# Categorical columns for One-Hot Encoding (OHE)
CATEGORICAL_COLS_OHE = ['dependent_upos', 'dependent_deprel']

# Base numerical columns passed to the scaler. 'position_numeric' is appended
# to this list at runtime once 'position' has been encoded.
BASE_NUMERIC_COLS = [
    "distance",
    "num_dependents_nominal",
    "num_dependents_dependent",
    "is_verbal" #a simple numeric (0/1) feature
]

# --- Main Script ---

if __name__ == "__main__":
    # Feature-engineering pipeline: load the base CSV, drop unused columns,
    # encode 'position', one-hot encode the categorical columns, add
    # UPOS x position interactions, scale the base numeric columns, and save.
    print("--- Starting Feature Engineering for Logistic Regression ---")

    # --- 1. Load Data ---
    print(f"Loading base features from: {INPUT_BASE_CSV_PATH}")
    if not os.path.exists(INPUT_BASE_CSV_PATH):
        raise FileNotFoundError(f"Input file not found: {INPUT_BASE_CSV_PATH}")
    try:
        df = pd.read_csv(INPUT_BASE_CSV_PATH)
    except Exception as e:
        # Chain the original error so the real cause stays visible in the traceback.
        raise IOError(f"Error loading CSV file: {e}") from e
    print(f"Loaded {len(df)} rows.")

    # --- 2. Drop Trivial Columns ---
    print(f"Dropping specified columns: {COLUMNS_TO_DROP}")
    df.drop(columns=COLUMNS_TO_DROP, inplace=True, errors='ignore')
    print(f"Columns remaining: {df.columns.tolist()}")

    # --- 3. Encode Position ---
    print("Encoding 'position' numerically ('before': 2, 'after': 1)...")
    if "position" in df.columns:
        df['position_numeric'] = df['position'].map({'before': 2, 'after': 1})
        # Values other than 'before'/'after' map to NaN; they are reported but not filled here.
        if df['position_numeric'].isnull().any():
            print("Warning: Found null values after mapping 'position'. Filling with a default (e.g., 0 or median) might be needed depending on analysis.")
        df.drop(columns=['position'], inplace=True)  # Drop original string column
    else:
        print("Warning: 'position' column not found for encoding.")

    # Add the new numeric position column to the list of columns to scale
    if 'position_numeric' in df.columns and 'position_numeric' not in BASE_NUMERIC_COLS:
        BASE_NUMERIC_COLS.append('position_numeric')

    # --- 4. One-Hot Encode Categorical Features ---
    print(f"Applying One-Hot Encoding to: {CATEGORICAL_COLS_OHE}")
    initial_cols = set(df.columns)
    df = pd.get_dummies(df, columns=CATEGORICAL_COLS_OHE, prefix=CATEGORICAL_COLS_OHE, dummy_na=False)
    # Walk the DataFrame's own column order rather than taking a set difference:
    # set iteration order depends on string hashing, which would make the order
    # of the interaction columns (and the output CSV layout) vary between runs.
    new_ohe_cols = [col for col in df.columns if col not in initial_cols]
    print(f"Created {len(new_ohe_cols)} new columns from One-Hot Encoding.")

    # Separate UPOS columns for interaction step
    upos_ohe_cols = [col for col in new_ohe_cols if col.startswith('dependent_upos_')]

    # --- 5. Generate Interaction Features ---
    print("Generating interaction features (UPOS * Position)...")
    interaction_feature_names = []
    if 'position_numeric' in df.columns and upos_ohe_cols:
        for upos_col in upos_ohe_cols:
            interaction_col_name = f"{upos_col}_x_position"
            df[interaction_col_name] = df[upos_col] * df['position_numeric']
            interaction_feature_names.append(interaction_col_name)
        print(f"Generated {len(interaction_feature_names)} interaction features.")
    elif 'position_numeric' not in df.columns:
        print("Skipping interaction generation: 'position_numeric' column not available.")
    else:
        print("Skipping interaction generation: No UPOS OHE columns found.")

    # --- 6. Identify Numerical Features for Scaling ---
    # Only the columns in BASE_NUMERIC_COLS (base numerics, is_verbal and the
    # encoded position) are scaled; OHE and interaction columns are left as-is.
    print(f"Numerical features identified for scaling: {BASE_NUMERIC_COLS}")

    # --- 7. Apply RobustScaler ---
    print("Applying RobustScaler...")
    missing_numeric = [col for col in BASE_NUMERIC_COLS if col not in df.columns]
    if not missing_numeric:
        scaler = RobustScaler()
        df[BASE_NUMERIC_COLS] = scaler.fit_transform(df[BASE_NUMERIC_COLS])
        print("RobustScaler applied successfully.")
    else:
        # Scaling is all-or-nothing: if any expected column is absent, nothing is scaled.
        print(f"Warning: Could not apply RobustScaler. Missing numerical columns: {missing_numeric}")

    # --- 8. Save Engineered Features ---
    print(f"\nSaving engineered features ({len(df)} rows) to: {OUTPUT_ENGINEERED_CSV_PATH}")
    try:
        # Ensure output directory exists
        output_dir = os.path.dirname(OUTPUT_ENGINEERED_CSV_PATH)
        if output_dir and not os.path.exists(output_dir):
            print(f"Creating output directory: {output_dir}")
            # exist_ok guards against the directory appearing between the check and the call
            os.makedirs(output_dir, exist_ok=True)

        df.to_csv(OUTPUT_ENGINEERED_CSV_PATH, index=False, encoding='utf-8')
        print("--- Feature Engineering complete. Engineered features saved successfully. ---")
    except IOError as e:
        print(f"Error: Failed to write output file '{OUTPUT_ENGINEERED_CSV_PATH}'. Check path and permissions.")
        print(f"Details: {e}")
    except Exception as e:
        print(f"An unexpected error occurred during file saving: {e}")

print("--- Script execution finished. ---")

--- Starting Feature Engineering for Logistic Regression ---
Loading base features from: C:\Users\rahaa\Dropbox\MPCD\LR-input.csv
Loaded 23238 rows.
Dropping specified columns: ['nominal_head_id', 'dependent_id', 'nominal_head_lemma', 'nominal_head_upos', 'dependent_lemma']
Columns remaining: ['dependent_upos', 'dependent_deprel', 'distance', 'position', 'num_dependents_nominal', 'num_dependents_dependent', 'ezafe_label', 'is_verbal', 'source_file']
Encoding 'position' numerically ('before': 2, 'after': 1)...
Applying One-Hot Encoding to: ['dependent_upos', 'dependent_deprel']
Created 66 new columns from One-Hot Encoding.
Generating interaction features (UPOS * Position)...
Generated 15 interaction features.
Numerical features identified for scaling: ['distance', 'num_dependents_nominal', 'num_dependents_dependent', 'is_verbal', 'position_numeric']
Applying RobustScaler...
RobustScaler applied successfully.

Saving engineered features (23238 rows) to: C:\Users\rahaa\Dropbox\MPCD\LR-inp

In [7]:
# -*- coding: utf-8 -*-
"""
Logistic Regression Model - EXACT THESIS REPLICATION SCRIPT

Author: [Raha Musavi]
Date: [2025-05-02]

This script replicates the SPECIFIC workflow from the final cell (f5988c27)
of the development notebook 'testthemodelsresults.ipynb', which generated
the Logistic Regression results reported in the thesis document (Figure 15).

Key characteristics replicated from that specific cell:
- Input: 'nominal_features_cleaned.csv' (assumed equivalent to LR-input.csv)
- Scaler: StandardScaler (NOT RobustScaler)
- Feature Selection: Importance-based selection with threshold 0.15 applied
  after an initial model fit.
- Oversampling: Applied BEFORE the train-test split.
- Train/Test Split: 70/30 on the OVERSAMPLED data.
- Final Model Base: LogisticRegression(C=10, ...)
- Final Hyperparameter Grid: Only tunes 'solver' = ['newton-cg', 'liblinear']
- Final CV Folds: 10

**Methodological Note:** This script applies oversampling before the train-test
split to match the specific methodology that produced the thesis results.
Standard best practice usually recommends splitting first, then oversampling
only the training set. Evaluation here is performed on the test set derived
from the OVERSAMPLED data.

Workflow:
1.  Loads the base feature dataset ('nominal_features_cleaned.csv').
2.  Performs initial feature engineering (position encoding, OHE).
3.  Scales numeric features using StandardScaler.
4.  Trains an initial model to calculate feature importance.
5.  Selects features based on importance threshold (0.15).
6.  Applies RandomOverSampler to the feature-selected dataset.
7.  Splits the OVERSAMPLED data into training (70%) and testing (30%) sets.
8.  Defines the final Logistic Regression model (C=10) and the specific hyperparameter grid (solver only).
9.  Uses GridSearchCV with 10-fold cross-validation on the OVERSAMPLED training data.
10. Fits the GridSearchCV object.
11. Retrieves the best model.
12. Makes predictions on the OVERSAMPLED test set.
13. Evaluates the model using accuracy, classification report, and confusion matrix.
14. Saves results to files, clearly marked as thesis replication.

Dependencies:
- Python 3.6+
- pandas
- scikit-learn
- imbalanced-learn
- numpy
- matplotlib
- seaborn
- json
"""

import os
import pandas as pd
import numpy as np
import warnings
import json
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
# Import StandardScaler instead of RobustScaler for this specific replication
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.over_sampling import RandomOverSampler
import matplotlib.pyplot as plt
import seaborn as sns

# --- Configuration ---

# Input file path (MUST match the file used for the thesis results)
INPUT_BASE_CSV_PATH = r'C:\Users\rahaa\Dropbox\MPCD\nominal_features_cleaned.csv' # ADJUST IF NEEDED

# Target column name
TARGET_COLUMN = 'ezafe_label'

# Train-test split configuration (applied AFTER oversampling)
TEST_SET_SIZE = 0.30 # 70/30 split as per thesis script
# Single seed shared by the oversampler, the models and the CV splitter below
RANDOM_STATE = 42

# Feature Selection Threshold from thesis script
# (a feature is kept when the absolute coefficient of the initial model is strictly greater)
IMPORTANCE_THRESHOLD = 0.15

# Oversampler configuration
OVERSAMPLER = RandomOverSampler(random_state=RANDOM_STATE)

# Logistic Regression base model configuration for FINAL tuning
# Matching the hardcoded C=10 from the target cell
LOGREG_FINAL_MODEL = LogisticRegression(
    C=10, # Hardcoded C=10 as per thesis script
    max_iter=1000, # Increase max_iter for robustness
    class_weight='balanced',
    random_state=RANDOM_STATE
)

# GridSearchCV configuration - ONLY tuning solver as per thesis script
PARAM_GRID = {
    'solver': ['newton-cg', 'liblinear'] # Exact grid from target cell
}
CV_FOLDS = 10 # 10 folds as per thesis script
# Stratified, shuffled K-fold splitter built from CV_FOLDS and seeded with RANDOM_STATE
GRID_SEARCH_CV = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=RANDOM_STATE)

# Paths for saving results
# (RESULTS_DIR is a relative path, so it resolves against the current working directory)
RESULTS_DIR = 'results_thesis_replication_lr_exact' # New distinct directory name
REPORT_FILE_PATH = os.path.join(RESULTS_DIR, 'lr_evaluation_report_thesis_exact.txt')
PARAMS_FILE_PATH = os.path.join(RESULTS_DIR, 'lr_best_params_thesis_exact.json')
CM_PLOT_FILE_PATH = os.path.join(RESULTS_DIR, 'lr_confusion_matrix_thesis_exact.png')
CM_DATA_FILE_PATH = os.path.join(RESULTS_DIR, 'lr_confusion_matrix_data_thesis_exact.csv')
IMPORTANCE_CSV_PATH = os.path.join(RESULTS_DIR, "lr_feature_importance_thesis_exact.csv")
SELECTED_FEATURES_PATH = os.path.join(RESULTS_DIR, "lr_selected_features_thesis_exact.txt") # Save selected features

# --- Main Script ---

if __name__ == "__main__":
    print("--- Starting LR Model - EXACT THESIS REPLICATION WORKFLOW ---")
    warnings.filterwarnings("ignore")

    # --- Create results directory ---
    if not os.path.exists(RESULTS_DIR):
        print(f"Creating results directory: {RESULTS_DIR}")
        os.makedirs(RESULTS_DIR)

    # --- 1. Load Data ---
    print(f"Loading base features from: {INPUT_BASE_CSV_PATH}")
    if not os.path.exists(INPUT_BASE_CSV_PATH):
        raise FileNotFoundError(f"Input file not found: {INPUT_BASE_CSV_PATH}")
    try:
        nominals_df = pd.read_csv(INPUT_BASE_CSV_PATH)
        print(f"Loaded {len(nominals_df)} rows.")
    except Exception as e:
        raise IOError(f"Error loading CSV file: {e}")

    # --- 2. Initial Feature Engineering ---
    print("Performing initial feature engineering...")
    nominals_df.drop(columns=["nominal_head_id", "dependent_id"], inplace=True, errors='ignore')

    if "position" in nominals_df.columns:
        nominals_df['position_numeric'] = nominals_df['position'].map({'before': 2, 'after': 1}).fillna(0) # Added fillna
        nominals_df.drop(columns=['position'], inplace=True, errors='ignore') # Added errors='ignore'
    else: print("Warning: 'position' column not found.")

    initial_numeric_features = ["distance", "num_dependents_nominal", "num_dependents_dependent", "position_numeric"]

    # OHE dependent_deprel (using OneHotEncoder as per target cell)
    deprel_columns = []
    if "dependent_deprel" in nominals_df.columns:
        deprel_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False, drop=None)
        nominals_df['dependent_deprel'].fillna('missing_deprel', inplace=True)
        deprel_encoded = deprel_encoder.fit_transform(nominals_df[["dependent_deprel"]])
        deprel_columns = deprel_encoder.get_feature_names_out() # Correct usage
        deprel_df = pd.DataFrame(deprel_encoded, columns=deprel_columns, index=nominals_df.index)
        nominals_df = pd.concat([nominals_df.drop(columns=['dependent_deprel']), deprel_df], axis=1)
        print(f"Created {len(deprel_columns)} deprel OHE columns.")
    else: print("Warning: 'dependent_deprel' not found.")

    # OHE dependent_upos (using get_dummies as per target cell)
    upos_columns = []
    if "dependent_upos" in nominals_df.columns:
        nominals_df['dependent_upos'].fillna('missing_upos', inplace=True)
        upos_dummies = pd.get_dummies(nominals_df["dependent_upos"], prefix="dependent_upos", dummy_na=False)
        nominals_df = pd.concat([nominals_df.drop(columns=['dependent_upos']), upos_dummies], axis=1)
        upos_columns = list(upos_dummies.columns)
        print(f"Created {len(upos_columns)} UPOS OHE columns.")
    else: print("Warning: 'dependent_upos' not found.")

    # Interaction Terms (for *all* UPOS dummies as per target cell)
    interaction_term_names = []
    if 'position_numeric' in nominals_df.columns and upos_columns:
        interaction_terms = pd.DataFrame(index=nominals_df.index)
        for col in upos_columns:
            interaction_col_name = f"{col}_position_interaction"
            interaction_terms[interaction_col_name] = nominals_df[col] * nominals_df["position_numeric"]
            interaction_term_names.append(interaction_col_name)
        nominals_df = pd.concat([nominals_df, interaction_terms], axis=1)
        print(f"Generated {len(interaction_term_names)} interaction features.")
    else: print("Skipping interaction generation due to missing columns.")

    # --- Define X and y (before feature selection) ---
    print("Defining initial feature set and target...")
    if TARGET_COLUMN not in nominals_df.columns:
        raise KeyError(f"Target column '{TARGET_COLUMN}' not found.")
    y = nominals_df[TARGET_COLUMN]

    # Define features including 'is_verbal' if present
    base_feature_cols = [col for col in initial_numeric_features if col in nominals_df.columns]
    if "is_verbal" in nominals_df.columns: base_feature_cols.append("is_verbal")
    else: print("Warning: 'is_verbal' column not found.")

    feature_cols_for_X = (
         base_feature_cols
         + upos_columns
         + interaction_term_names
         + list(deprel_columns)
    )
    # Ensure columns exist and remove duplicates
    feature_cols_for_X = [col for col in pd.unique(feature_cols_for_X) if col in nominals_df.columns]
    X_full = nominals_df[feature_cols_for_X].copy()

    # Drop remaining non-numeric columns (like lemmas)
    non_numeric_in_X = X_full.select_dtypes(exclude=np.number).columns
    if not non_numeric_in_X.empty:
        print(f"Dropping non-numeric columns found before scaling: {non_numeric_in_X.tolist()}")
        X_full.drop(columns=non_numeric_in_X, inplace=True)

    # Identify numeric cols for scaling *within X_full*
    numeric_features_to_scale = [col for col in initial_numeric_features if col in X_full.columns]

    # --- 3. Scale Numeric Features (using StandardScaler) ---
    print(f"Applying StandardScaler to: {numeric_features_to_scale}")
    if numeric_features_to_scale:
        # Check/Handle NaN/Inf before scaling
        numeric_subset = X_full[numeric_features_to_scale]
        if numeric_subset.isnull().values.any() or np.isinf(numeric_subset.values).any():
             print(f"Warning: NaN/inf values found before scaling. Filling with 0.")
             for col in numeric_features_to_scale:
                  X_full[col] = X_full[col].replace([np.inf, -np.inf], np.nan).fillna(0)
        scaler = StandardScaler() # Using StandardScaler as per target cell
        X_full.loc[:, numeric_features_to_scale] = scaler.fit_transform(X_full[numeric_features_to_scale])
    else: print("Warning: No numeric features to scale.")

    # --- 4. Initial Model for Feature Importance ---
    print("Training initial model for feature importance...")
    # Final check/fill NaN/Inf
    if X_full.isnull().values.any() or np.isinf(X_full.values).any():
        print(f"Warning: NaN/inf values found before initial fit. Filling with 0.")
        X_full = X_full.fillna(0)
    initial_model = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=RANDOM_STATE)
    try:
        initial_model.fit(X_full, y)
    except Exception as e: print(f"Error fitting initial model: {e}"); raise e

    # --- 5. Importance-Based Feature Selection ---
    print(f"Selecting features with absolute importance > {IMPORTANCE_THRESHOLD}...")
    if hasattr(initial_model, 'coef_'):
        feature_importance = pd.DataFrame({
            "Feature": X_full.columns, "Coefficient Value": initial_model.coef_.flatten()})
        feature_importance["Absolute Importance"] = feature_importance["Coefficient Value"].abs()
        feature_importance = feature_importance.sort_values(by="Absolute Importance", ascending=False).round(5)
        selected_features = feature_importance[feature_importance["Absolute Importance"] > IMPORTANCE_THRESHOLD]["Feature"].tolist()
        if not selected_features:
            print(f"Warning: No features met importance threshold {IMPORTANCE_THRESHOLD}! Using all features.")
            selected_features = X_full.columns.tolist()
        else: print(f"Selected {len(selected_features)} features out of {X_full.shape[1]}.")
        X_final = X_full[selected_features].copy() # Use .copy()
        # Save importance and selected features
        try:
            feature_importance.to_csv(IMPORTANCE_CSV_PATH, index=False)
            print(f"Feature importance saved to: {IMPORTANCE_CSV_PATH}")
            with open(SELECTED_FEATURES_PATH, 'w') as f:
                for feature in selected_features: f.write(f"{feature}\n")
            print(f"Selected feature list saved to: {SELECTED_FEATURES_PATH}")
        except Exception as e: print(f"Error saving importance/selection files: {e}")
    else:
        print("Error: Initial model has no coefficients. Cannot perform importance selection.")
        X_final = X_full # Fallback to using all features

    # --- 6. Apply Oversampling BEFORE Train/Test Split ---
    print("Applying RandomOverSampler to the feature-selected dataset...")
    X_resampled, y_resampled = OVERSAMPLER.fit_resample(X_final, y)
    print(f"Resampled dataset size: {X_resampled.shape[0]} samples")
    print("Value counts in resampled target:\n", pd.Series(y_resampled).value_counts())

    # --- 7. Split OVERSAMPLED Data (Train/Test) ---
    print(f"Splitting OVERSAMPLED data into Train (70%) / Test (30%) sets...")
    X_train, X_test, y_train, y_test = train_test_split(
        X_resampled, y_resampled, test_size=TEST_SET_SIZE, random_state=RANDOM_STATE)
    print(f"Training set size: {X_train.shape[0]}, Test set size: {X_test.shape[0]}")

    # --- 8 & 9. Setup and Run GridSearchCV ---
    print(f"\nStarting GridSearchCV with {CV_FOLDS}-fold CV (tuning only solver)...")
    grid_search = GridSearchCV(
        estimator=LOGREG_FINAL_MODEL, # Uses C=10 base model
        param_grid=PARAM_GRID,        # Uses solver-only grid
        cv=GRID_SEARCH_CV,
        scoring='accuracy',
        n_jobs=-1, verbose=1, refit=True)

    # --- 10. Fit GridSearchCV ---
    grid_search.fit(X_train, y_train)

    # --- 11. Get Best Model and Results ---
    print("\nGridSearchCV finished.")
    best_params = grid_search.best_params_
    best_cv_score = grid_search.best_score_
    # Note: best_params will only contain 'solver' here
    print(f"Best parameters found: {{'C': 10, **best_params}}") # Manually add C=10 for clarity
    print(f"Best cross-validation accuracy score: {best_cv_score:.4f}")
    best_model = grid_search.best_estimator_

    # --- 12. Make Predictions on OVERSAMPLED Test Set ---
    print("\nMaking predictions on the OVERSAMPLED test set...")
    y_pred = best_model.predict(X_test)

    # --- 13. Evaluate Model ---
    print("\n--- Evaluation Results on OVERSAMPLED Test Set ---")
    test_accuracy = accuracy_score(y_test, y_pred)
    print(f"Test Accuracy (on oversampled test set): {test_accuracy:.4f}")
    print("\nClassification Report (on oversampled test set):")
    report_str = classification_report(y_test, y_pred, target_names=['Ezafe Absent (0)', 'Ezafe Present (1)'])
    print(report_str)
    print("\nConfusion Matrix (on oversampled test set):")
    cm = confusion_matrix(y_test, y_pred)
    print(cm)

    # --- 14. Save Results to Files ---
    print(f"\n--- Saving Evaluation Results to '{RESULTS_DIR}' ---")
    # (Saving logic - same as before, using variables derived above)
    # Ensure results directory exists
    if not os.path.exists(RESULTS_DIR):
        print(f"Creating results directory: {RESULTS_DIR}")
        os.makedirs(RESULTS_DIR)
    try:
        with open(REPORT_FILE_PATH, 'w', encoding='utf-8') as f:
            f.write("Logistic Regression Model Evaluation Report (Exact Thesis Replication Workflow)\n")
            f.write("===============================================================================\n\n")
            f.write("** WARNING: Evaluation performed on test split derived from OVERSAMPLED data, matching specific thesis methodology. **\n\n")
            f.write(f"Input Data: {INPUT_BASE_CSV_PATH}\n")
            f.write(f"Feature Selection Threshold: {IMPORTANCE_THRESHOLD}\n")
            f.write(f"Number of Selected Features: {len(selected_features)}\n\n")
            f.write(f"Best Hyperparameters Found by GridSearchCV (C fixed at 10):\n{{'C': 10, **best_params}}\n\n") # Reflect fixed C
            f.write(f"Best Cross-Validation Accuracy Score: {best_cv_score:.4f}\n\n")
            f.write(f"Test Set Accuracy (on oversampled split): {test_accuracy:.4f}\n\n")
            f.write("Test Set Classification Report (on oversampled split):\n")
            f.write(report_str)
            f.write("\n\nTest Set Confusion Matrix (on oversampled split):\n")
            f.write(np.array2string(cm, separator=', '))
        print(f"Evaluation report saved to: {REPORT_FILE_PATH}")
    except Exception as e: print(f"Error saving text report: {e}")

    try:
        # Save the actual best params found (solver) along with the fixed C
        final_params_to_save = {'C': 10, **best_params}
        with open(PARAMS_FILE_PATH, 'w', encoding='utf-8') as f: json.dump(final_params_to_save, f, indent=4)
        print(f"Best parameters saved to: {PARAMS_FILE_PATH}")
    except Exception as e: print(f"Error saving parameters JSON: {e}")

    try:
        plt.figure(figsize=(6, 4))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=['Predicted Absent (0)', 'Predicted Present (1)'],
                    yticklabels=['Actual Absent (0)', 'Actual Present (1)'])
        plt.ylabel('Actual Label')
        plt.xlabel('Predicted Label')
        plt.title('Confusion Matrix - Oversampled Test Set (Thesis Rep)')
        plt.tight_layout()
        plt.savefig(CM_PLOT_FILE_PATH, dpi=300)
        print(f"Confusion matrix plot saved to: {CM_PLOT_FILE_PATH}")
        plt.show()
        plt.close()
    except ImportError: print("\n(Install matplotlib and seaborn to save plot)")
    except Exception as e: print(f"Error saving confusion matrix plot: {e}")

    try:
        cm_df = pd.DataFrame(cm, index=['Actual Absent (0)', 'Actual Present (1)'], columns=['Predicted Absent (0)', 'Predicted Present (1)'])
        cm_df.to_csv(CM_DATA_FILE_PATH)
        print(f"Confusion matrix data saved to: {CM_DATA_FILE_PATH}")
    except Exception as e: print(f"Error saving confusion matrix data: {e}")

    print("\n--- Script execution finished. ---")

--- Starting LR Model - EXACT THESIS REPLICATION WORKFLOW ---
Creating results directory: results_thesis_replication_lr_exact
Loading base features from: C:\Users\rahaa\Dropbox\MPCD\nominal_features_cleaned.csv


FileNotFoundError: Input file not found: C:\Users\rahaa\Dropbox\MPCD\nominal_features_cleaned.csv