# Library Installation

In [16]:
# Install the third-party packages for this notebook into the kernel's environment.
# scipy and statsmodels are installed here but not imported by any cell visible in this notebook.
%pip install numpy matplotlib scipy pandas seaborn scikit-learn statsmodels spacy gensim

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.



In [18]:
# Upgrade pandas and numpy in place. pip's output for this cell notes that the kernel
# may need to be restarted before the updated packages are used.
# NOTE(review): the AttributeError recorded under the Word2Vec cell may come from
# continuing without that restart — confirm.
%pip install --upgrade pandas
%pip install --upgrade numpy


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [22]:
# spaCy is already listed in the first install cell; this repeats the install
# right before fetching the pipeline package.
%pip install spacy
# Download the xx_ent_wiki_sm pipeline that spacy.load('xx_ent_wiki_sm') needs later.
!python -m spacy download xx_ent_wiki_sm

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Collecting xx-ent-wiki-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/xx_ent_wiki_sm-3.8.0/xx_ent_wiki_sm-3.8.0-py3-none-any.whl (11.1 MB)
     ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
      --------------------------------------- 0.3/11.1 MB ? eta -:--:--
     --- ------------------------------------ 1.0/11.1 MB 3.4 MB/s eta 0:00:03
     --------- ------------------------------ 2.6/11.1 MB 5.4 MB/s eta 0:00:02
     ------------- -------------------------- 3.7/11.1 MB 5.5 MB/s eta 0:00:02
     ----------------- ---------------------- 4.7/11.1 MB 5.5 MB/s eta 0:00:02
     ---------------------- ----------------- 6.3/11.1 MB 5.7 MB/s eta 0:00:01
     ----------------------------- ---------- 8.1/11.1 MB 6.2 M

In [23]:
import json
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve, precision_recall_curve,
    average_precision_score)
import matplotlib.pyplot as plt
import seaborn as sns

# Load Word2Vec Embeddings
def load_word2vec_model(filepath):
    """Read word vectors stored in the word2vec text format.

    Args:
        filepath: Location of the vector file; forwarded unchanged to
            ``KeyedVectors.load_word2vec_format``.

    Returns:
        The object produced by ``KeyedVectors.load_word2vec_format`` when
        called with ``binary=False``.
    """
    print("Loading Word2Vec model...")
    keyed_vectors = KeyedVectors.load_word2vec_format(filepath, binary=False)
    return keyed_vectors

# Generate sentence embeddings
def generate_sentence_embeddings(texts, word2vec_model, vector_size):
    """Build one fixed-size vector per text by averaging its word vectors.

    Each text is split on whitespace. Tokens that are not in
    ``word2vec_model`` are ignored, and a text with no known tokens is
    represented by an all-zero vector of length ``vector_size``.

    Args:
        texts: Iterable of strings.
        word2vec_model: Object supporting ``word in model`` and
            ``model[word]`` (the latter returning the word's vector).
        vector_size: Length of the zero vector used for texts without any
            known token, and the column count of the result for empty input.

    Returns:
        Array with one row per text. For empty ``texts`` the result has shape
        ``(0, vector_size)`` so it can still be stacked with other feature
        matrices.
    """
    print("Generating sentence embeddings...")
    embeddings = []
    for text in texts:
        known_vectors = [word2vec_model[word] for word in text.split() if word in word2vec_model]
        if known_vectors:
            embeddings.append(np.mean(known_vectors, axis=0))
        else:
            embeddings.append(np.zeros(vector_size))
    if not embeddings:
        # np.array([]) would be 1-D with shape (0,); keep the 2-D layout instead.
        return np.empty((0, vector_size))
    return np.array(embeddings)

# Load JSON data
def load_json_data(file_path):
    """Parse a UTF-8 encoded JSON file and wrap its content in a DataFrame.

    Args:
        file_path: Path of the JSON file to open for reading.

    Returns:
        ``pd.DataFrame`` constructed from the parsed JSON content.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        parsed = json.load(file)
    return pd.DataFrame(parsed)

# Data loading and preprocessing
# The three splits are read in a fixed order: training, development, testing.
train_data, development_data, test_data = [
    load_json_data(split_path)
    for split_path in (
        'D:/Github Projects/Machine Learning in Law/Phase (2) - Data Processing & EDA/Preprocessed Data/training_dataset.json',
        'D:/Github Projects/Machine Learning in Law/Phase (2) - Data Processing & EDA/Preprocessed Data/development_dataset.json',
        'D:/Github Projects/Machine Learning in Law/Phase (2) - Data Processing & EDA/Preprocessed Data/testing_dataset.json',
    )
]

# Load Word2Vec model (replace the path with the location of your own embeddings)
# Alternative location: "cc.tr.300.vec.gz"
word2vec_model_path = "./Phase (3) - Model Development/Decision Trees - Experiment/Word Embedding/cc.tr.300.vec.gz"  # Turkish FastText embeddings
word2vec_model = load_word2vec_model(word2vec_model_path)
vector_size = word2vec_model.vector_size

# Generate features using Word2Vec embeddings: the 'text' column of each split
# becomes a feature matrix, the 'labels' column becomes the target.
X_train, X_dev, X_test = [
    generate_sentence_embeddings(split_frame['text'], word2vec_model, vector_size)
    for split_frame in (train_data, development_data, test_data)
]
y_train, y_dev, y_test = [
    split_frame['labels'] for split_frame in (train_data, development_data, test_data)
]

# Combine train and dev datasets (rows of X stacked, labels appended in the same order)
X_combined = np.vstack((X_train, X_dev))
y_combined = np.hstack((y_train, y_dev))

# Train Decision Tree Model
# Fixed hyperparameters; the tree is fitted on the combined train + dev rows.
model = DecisionTreeClassifier(
    criterion='gini',
    max_depth=10,
    min_samples_split=5,
    random_state=40,
).fit(X_combined, y_combined)

# Predictions and Probabilities
predictions = model.predict(X_test)
# Column 1 of predict_proba is used as the score for the ROC / PR computations
# (assumes binary labels whose positive class is the second column — TODO confirm).
class_probabilities = model.predict_proba(X_test)
probabilities = class_probabilities[:, 1]

# Performance Metrics (all computed against the held-out test labels,
# except the 5-fold cross-validation, which runs on the combined train + dev data)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=predictions)
class_report = classification_report(y_true=y_test, y_pred=predictions)
roc_auc = roc_auc_score(y_true=y_test, y_score=probabilities)
cross_val_scores = cross_val_score(
    estimator=model, X=X_combined, y=y_combined, cv=5, scoring='accuracy'
)

# Decision Tree Visualization
tree_fig, tree_ax = plt.subplots(figsize=(24, 12))
plot_tree(
    model,
    ax=tree_ax,
    class_names=['Class 0', 'Class 1'],
    filled=True,
    rounded=True,
    fontsize=12,
    impurity=True,
    proportion=True
)
tree_ax.set_title('Decision Tree Visualization', fontsize=16, fontweight='bold')
tree_fig.tight_layout()
plt.show()

# Confusion Matrix Heatmap (rows: actual class, columns: predicted class)
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Class 0', 'Class 1'],
    yticklabels=['Class 0', 'Class 1'],
    ax=cm_ax,
)
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
cm_ax.set_title('Confusion Matrix')
cm_fig.tight_layout()
plt.show()

# ROC Curve, with the diagonal drawn as a dashed gray reference line
fpr, tpr, _ = roc_curve(y_test, probabilities)
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}')
roc_ax.plot([0, 1], [0, 1], linestyle='--', color='gray')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve')
roc_ax.legend()
roc_fig.tight_layout()
plt.show()

# Precision-Recall Curve
precision, recall, _ = precision_recall_curve(y_test, probabilities)
avg_precision = average_precision_score(y_test, probabilities)
pr_fig, pr_ax = plt.subplots(figsize=(8, 6))
pr_ax.plot(recall, precision, label=f'AP = {avg_precision:.2f}')
pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.set_title('Precision-Recall Curve')
pr_ax.legend()
pr_fig.tight_layout()
plt.show()

# Learning Curve: 5-fold scores at ten training-set fractions from 10% to 100%
train_sizes, train_scores, test_scores = learning_curve(
    model, X_combined, y_combined, cv=5, train_sizes=np.linspace(0.1, 1.0, 10)
)

# Mean and spread across the folds for every training-set size
train_mean = train_scores.mean(axis=1)
train_std = train_scores.std(axis=1)
test_mean = test_scores.mean(axis=1)
test_std = test_scores.std(axis=1)

lc_fig, lc_ax = plt.subplots(figsize=(10, 6))
lc_ax.plot(train_sizes, train_mean, label='Training score')
lc_ax.plot(train_sizes, test_mean, label='Cross-validation score')
# Shaded bands show one standard deviation around each mean curve
lc_ax.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1)
lc_ax.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.1)
lc_ax.set_xlabel('Training Examples')
lc_ax.set_ylabel('Score')
lc_ax.set_title('Learning Curve')
lc_ax.legend(loc='best')
lc_fig.tight_layout()
plt.show()

# Save Results
# cross_val_scores is converted to a plain list so it can be written as JSON.
results = {}
results['accuracy'] = accuracy
results['roc_auc'] = roc_auc
results['cross_val_scores'] = cross_val_scores.tolist()
with open('model_results.json', 'w') as results_file:
    json.dump(results, results_file, indent=4)

# Performance Summary
summary_lines = (
    "\nModel Performance Summary",
    "-" * 50,
    f"Accuracy: {accuracy:.3f}",
    f"ROC-AUC: {roc_auc:.3f}",
    f"Cross-Validation Scores: {cross_val_scores}",
    f"Mean Cross-Validation Accuracy: {np.mean(cross_val_scores):.3f}",
    "\nClassification Report:",
    class_report,
)
print(*summary_lines, sep="\n")


AttributeError: 'numpy.ufunc' object has no attribute '__qualname__'

In [None]:
import json
import numpy as np
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split, cross_val_score, learning_curve
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve, precision_recall_curve,
    average_precision_score)
import matplotlib.pyplot as plt
import seaborn as sns

# Load the spaCy pipeline 'xx_ent_wiki_sm' (the package installed above; not a Turkish-specific model)
def load_spacy_model():
    """Load the spaCy pipeline registered under the name 'xx_ent_wiki_sm'.

    Returns:
        The object returned by ``spacy.load('xx_ent_wiki_sm')``.
    """
    print("Loading spaCy model...")
    pipeline = spacy.load('xx_ent_wiki_sm')
    return pipeline

# Generate sentence embeddings
def generate_sentence_embeddings(texts, spacy_model, vector_size):
    """Build one fixed-size vector per text by averaging its token vectors.

    Each text is passed through ``spacy_model``; only tokens whose
    ``has_vector`` flag is truthy contribute their ``vector`` to the mean.
    A text without any such token is represented by an all-zero vector of
    length ``vector_size``.

    Args:
        texts: Iterable of strings.
        spacy_model: Callable that maps a text to an iterable of tokens
            exposing ``has_vector`` and ``vector``.
        vector_size: Length of the zero vector used for texts without any
            usable token, and the column count of the result for empty input.

    Returns:
        Array with one row per text. For empty ``texts`` the result has shape
        ``(0, vector_size)`` so it can still be stacked with other feature
        matrices.
    """
    print("Generating sentence embeddings...")
    embeddings = []
    for text in texts:
        doc = spacy_model(text)
        token_vectors = [token.vector for token in doc if token.has_vector]
        if token_vectors:
            embeddings.append(np.mean(token_vectors, axis=0))
        else:
            embeddings.append(np.zeros(vector_size))
    if not embeddings:
        # np.array([]) would be 1-D with shape (0,); keep the 2-D layout instead.
        return np.empty((0, vector_size))
    return np.array(embeddings)

# Load JSON data
def load_json_data(file_path):
    """Parse a UTF-8 encoded JSON file and wrap its content in a DataFrame.

    Args:
        file_path: Path of the JSON file to open for reading.

    Returns:
        ``pd.DataFrame`` constructed from the parsed JSON content.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        parsed = json.load(file)
    return pd.DataFrame(parsed)

# Data loading and preprocessing
# The three splits are read from the preprocessed JSON files of Phase (2).
train_data = load_json_data('D:/Github Projects/Machine Learning in Law/Phase (2) - Data Processing & EDA/Preprocessed Data/training_dataset.json')
development_data = load_json_data('D:/Github Projects/Machine Learning in Law/Phase (2) - Data Processing & EDA/Preprocessed Data/development_dataset.json')
test_data = load_json_data('D:/Github Projects/Machine Learning in Law/Phase (2) - Data Processing & EDA/Preprocessed Data/testing_dataset.json')

# Load spaCy model
spacy_model = load_spacy_model()
vector_size = spacy_model.vocab.vectors.shape[1]  # Width of the pipeline's vector table

# A pipeline that ships without word vectors reports a width of 0. Every sentence
# embedding would then be an empty array and the classifier would be fitted on a
# matrix with zero feature columns. Stop here, before running the pipeline over
# all three splits, and say what is wrong.
if vector_size == 0:
    raise ValueError(
        "The loaded spaCy pipeline has no word vectors (vector width is 0), so the "
        "sentence embeddings would have zero features. Load a pipeline that ships "
        "word vectors before generating embeddings."
    )

# Generate features using spaCy embeddings: the 'text' column of each split
# becomes a feature matrix, the 'labels' column becomes the target.
X_train = generate_sentence_embeddings(train_data['text'], spacy_model, vector_size)
y_train = train_data['labels']

X_dev = generate_sentence_embeddings(development_data['text'], spacy_model, vector_size)
y_dev = development_data['labels']

X_test = generate_sentence_embeddings(test_data['text'], spacy_model, vector_size)
y_test = test_data['labels']

# Combine train and dev datasets (rows of X stacked, labels appended in the same order)
X_combined = np.vstack([X_train, X_dev])
y_combined = np.hstack([y_train, y_dev])

# Train Decision Tree Model
# Fixed hyperparameters; the tree is fitted on the combined train + dev rows.
model = DecisionTreeClassifier(
    criterion='gini',
    max_depth=10,
    min_samples_split=5,
    random_state=40,
).fit(X_combined, y_combined)

# Predictions and Probabilities
predictions = model.predict(X_test)
# Column 1 of predict_proba is used as the score for the ROC / PR computations
# (assumes binary labels whose positive class is the second column — TODO confirm).
class_probabilities = model.predict_proba(X_test)
probabilities = class_probabilities[:, 1]

# Performance Metrics (all computed against the held-out test labels,
# except the 5-fold cross-validation, which runs on the combined train + dev data)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=predictions)
class_report = classification_report(y_true=y_test, y_pred=predictions)
roc_auc = roc_auc_score(y_true=y_test, y_score=probabilities)
cross_val_scores = cross_val_score(
    estimator=model, X=X_combined, y=y_combined, cv=5, scoring='accuracy'
)

# Decision Tree Visualization
tree_fig, tree_ax = plt.subplots(figsize=(24, 12))
plot_tree(
    model,
    ax=tree_ax,
    class_names=['Class 0', 'Class 1'],
    filled=True,
    rounded=True,
    fontsize=12,
    impurity=True,
    proportion=True
)
tree_ax.set_title('Decision Tree Visualization', fontsize=16, fontweight='bold')
tree_fig.tight_layout()
plt.show()

# Confusion Matrix Heatmap (rows: actual class, columns: predicted class)
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Class 0', 'Class 1'],
    yticklabels=['Class 0', 'Class 1'],
    ax=cm_ax,
)
cm_ax.set_xlabel('Predicted')
cm_ax.set_ylabel('Actual')
cm_ax.set_title('Confusion Matrix')
cm_fig.tight_layout()
plt.show()

# ROC Curve, with the diagonal drawn as a dashed gray reference line
fpr, tpr, _ = roc_curve(y_test, probabilities)
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}')
roc_ax.plot([0, 1], [0, 1], linestyle='--', color='gray')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve')
roc_ax.legend()
roc_fig.tight_layout()
plt.show()

# Precision-Recall Curve
precision, recall, _ = precision_recall_curve(y_test, probabilities)
avg_precision = average_precision_score(y_test, probabilities)
pr_fig, pr_ax = plt.subplots(figsize=(8, 6))
pr_ax.plot(recall, precision, label=f'AP = {avg_precision:.2f}')
pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.set_title('Precision-Recall Curve')
pr_ax.legend()
pr_fig.tight_layout()
plt.show()

# Learning Curve: 5-fold scores at ten training-set fractions from 10% to 100%
train_sizes, train_scores, test_scores = learning_curve(
    model, X_combined, y_combined, cv=5, train_sizes=np.linspace(0.1, 1.0, 10)
)

# Mean and spread across the folds for every training-set size
train_mean = train_scores.mean(axis=1)
train_std = train_scores.std(axis=1)
test_mean = test_scores.mean(axis=1)
test_std = test_scores.std(axis=1)

lc_fig, lc_ax = plt.subplots(figsize=(10, 6))
lc_ax.plot(train_sizes, train_mean, label='Training score')
lc_ax.plot(train_sizes, test_mean, label='Cross-validation score')
# Shaded bands show one standard deviation around each mean curve
lc_ax.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1)
lc_ax.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, alpha=0.1)
lc_ax.set_xlabel('Training Examples')
lc_ax.set_ylabel('Score')
lc_ax.set_title('Learning Curve')
lc_ax.legend(loc='best')
lc_fig.tight_layout()
plt.show()

# Save Results
# cross_val_scores is converted to a plain list so it can be written as JSON.
# NOTE(review): the Word2Vec cell earlier in this notebook writes to the same
# 'model_results.json', so running both cells keeps only the later results — confirm intent.
results = {}
results['accuracy'] = accuracy
results['roc_auc'] = roc_auc
results['cross_val_scores'] = cross_val_scores.tolist()
with open('model_results.json', 'w') as results_file:
    json.dump(results, results_file, indent=4)

# Performance Summary
summary_lines = (
    "\nModel Performance Summary",
    "-" * 50,
    f"Accuracy: {accuracy:.3f}",
    f"ROC-AUC: {roc_auc:.3f}",
    f"Cross-Validation Scores: {cross_val_scores}",
    f"Mean Cross-Validation Accuracy: {np.mean(cross_val_scores):.3f}",
    "\nClassification Report:",
    class_report,
)
print(*summary_lines, sep="\n")
