In [None]:
import pandas as pd
import numpy as np
import re
from collections import Counter

# Load the dataset into a DataFrame.
# NOTE(review): `file_path` is an empty placeholder string here; set it to the
# real CSV location before running this cell.
file_path = r''
df = pd.read_csv(file_path)

# Preview the first rows of the loaded frame.
print(df.head())

# Tokenize: drop every non-letter character, lowercase, then split on whitespace.
def tokenize(text):
    """Split raw text into lowercase alphabetic tokens.

    Args:
        text: The string to tokenize. Any non-string value (for example the
            float NaN that pandas uses for a missing cell, or None) is treated
            as empty text.

    Returns:
        list[str]: Lowercased, whitespace-separated tokens with every character
        other than ASCII letters and whitespace removed.
    """
    # Missing cells reach this function as float NaN via Series.apply; re.sub
    # would raise TypeError on them, so treat them as having no tokens.
    if not isinstance(text, str):
        return []
    # Removed characters are deleted outright (no space is inserted), so
    # "don't" becomes "dont".
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text.lower().split()

# Hand-picked stopword list, kept as a set for quick membership checks
stopwords = {
    'the', 'and', 'is', 'in', 'it', 'of', 'to', 'a', 'on', 'with', 'as', 'for', 'was', 'were', 'by', 'an', 'this', 'that'
}

def remove_stopwords(tokens):
    """Return the entries of `tokens` that are not in `stopwords`, order preserved."""
    kept = []
    for token in tokens:
        if token in stopwords:
            continue
        kept.append(token)
    return kept

# Tokenize every summary and strip stopwords; the resulting token list is
# stored per row in a new 'processed_summary' column.
df['processed_summary'] = df['summary'].apply(lambda x: remove_stopwords(tokenize(x)))

# Preview the token lists of the first rows.
print(df['processed_summary'].head())


In [None]:
# Build the word -> index mapping from the tokenized corpus
def build_vocabulary(tokenized_texts):
    """Assign a unique integer index to every distinct token.

    Args:
        tokenized_texts: Iterable of token lists (one list per document).

    Returns:
        dict: Maps each token to an index; indices are handed out in order of
        first appearance, starting at 0.
    """
    vocabulary = {}
    for tokens in tokenized_texts:
        for word in tokens:
            # The next free index always equals the number of words seen so far.
            if word not in vocabulary:
                vocabulary[word] = len(vocabulary)
    return vocabulary

# Build the vocabulary from every processed summary and report how many
# distinct tokens it contains.
vocabulary = build_vocabulary(df['processed_summary'])
vocab_size = len(vocabulary)
print(f"Vocabulary Size: {vocab_size}")

# Turn a token list into a bag-of-words count vector
def text_to_bow(text, vocabulary):
    """Count vocabulary words in a token list.

    Args:
        text: Iterable of tokens.
        vocabulary: Dict mapping token -> index into the output vector.

    Returns:
        numpy.ndarray: Float vector of length ``len(vocabulary)`` whose entry
        ``vocabulary[w]`` is the number of times ``w`` occurs in ``text``.
        Tokens missing from the vocabulary are ignored.
    """
    vector = np.zeros(len(vocabulary))
    known_columns = (vocabulary[token] for token in text if token in vocabulary)
    for column in known_columns:
        vector[column] += 1
    return vector

# Convert every processed summary into its bag-of-words count vector.
df['bow_vector'] = df['processed_summary'].apply(lambda x: text_to_bow(x, vocabulary))

# Stack the per-row vectors into a 2-D feature matrix and take the labels
# as an array.
# NOTE(review): 'is_factual' is used as loaded here, with no numeric encoding
# applied in this cell.
X = np.array(df['bow_vector'].tolist())
y = np.array(df['is_factual'].values)

# Report the shapes of the feature matrix and the label array.
print(f"Input Shape: {X.shape}")
print(f"Output Shape: {y.shape}")


In [None]:

# Manual ordered train/test split (80/20 by default)
def train_test_split_manual(X, y, test_size=0.2):
    """Split X and y at a single cut point, without shuffling.

    Args:
        X: Feature container exposing ``.shape[0]`` and row slicing.
        y: Labels, sliced at the same positions as ``X``.
        test_size: Fraction of rows placed in the test part.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)`` where the training part
        is the leading ``int(n * (1 - test_size))`` rows and the test part is
        the remainder.
    """
    cut = int(X.shape[0] * (1 - test_size))
    leading, trailing = slice(None, cut), slice(cut, None)
    return X[leading], X[trailing], y[leading], y[trailing]

# Split features and labels with the default test_size (leading rows train,
# trailing rows test; no shuffling is done by the helper).
X_train, X_test, y_train, y_test = train_test_split_manual(X, y)

# Report the shapes of both partitions.
print(f"Training Data Shape: {X_train.shape}")
print(f"Testing Data Shape: {X_test.shape}")

In [None]:
import pandas as pd
import numpy as np
import re
from collections import Counter


# NOTE(review): `file_path` is an empty placeholder string here; set it to the
# real CSV location before running this cell.
file_path = r''
df = pd.read_csv(file_path)

# Report missing values per column; NaNs here usually point at a read problem.
print("Checking for NaN values in the original dataset:")
print(df.isnull().sum())  # one count per column

# Encode the label column as binary: 'yes' -> 1, 'no' -> 0.
# Case and surrounding whitespace are normalised first so variants such as
# 'Yes' or ' no ' are still recognised. Missing values become the string 'nan'
# after astype(str) and therefore map to NaN like any other unknown label.
label_map = {'yes': 1, 'no': 0}
df['is_factual'] = df['is_factual'].astype(str).str.strip().str.lower().map(label_map)

# Drop rows whose label is missing or unrecognised. This must run AFTER the
# mapping, otherwise unmapped labels would re-introduce NaNs into y.
# .copy() detaches the result so later column assignments do not trigger
# SettingWithCopyWarning.
df = df.dropna(subset=['is_factual']).copy()
df['is_factual'] = df['is_factual'].astype(int)

# Print the distinct label values to confirm that only 0 and 1 remain.
print("Unique values in 'is_factual':", df['is_factual'].unique())

# Tokenize: drop every non-letter character, lowercase, then split on whitespace.
def tokenize(text):
    """Split raw text into lowercase alphabetic tokens.

    Args:
        text: The string to tokenize. Any non-string value (for example the
            float NaN that pandas uses for a missing cell, or None) is treated
            as empty text.

    Returns:
        list[str]: Lowercased, whitespace-separated tokens with every character
        other than ASCII letters and whitespace removed.
    """
    # Only rows with a missing label are dropped before this runs, so a missing
    # summary can still arrive as float NaN; re.sub would raise TypeError on it.
    if not isinstance(text, str):
        return []
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text.lower().split()

# Stopwords to discard (same hand-written list as before), stored as a set
stopwords = {
    'the', 'and', 'is', 'in', 'it', 'of', 'to', 'a', 'on', 'with', 'as', 'for', 'was', 'were', 'by', 'an', 'this', 'that'
}

def remove_stopwords(tokens):
    """Filter out every token that appears in `stopwords`, keeping input order."""
    return list(filter(lambda token: token not in stopwords, tokens))

# Apply tokenization and stopword removal to every summary; the token list is
# stored per row in 'processed_summary'.
df['processed_summary'] = df['summary'].apply(lambda x: remove_stopwords(tokenize(x)))

# Check for NaN values once more; there should be none.
print("Checking for NaN values after processing:")
print(df[['summary', 'is_factual', 'processed_summary']].isnull().sum())  # one count per listed column

# Implement bag of words (BoW) again: first the word -> index vocabulary
def build_vocabulary(tokenized_texts):
    """Map each distinct token to an integer index.

    Indices start at 0 and follow the order in which tokens first appear when
    the documents are read in sequence.
    """
    vocabulary = {}
    for tokens in tokenized_texts:
        for word in tokens:
            # setdefault only assigns when the word is new, so the index stays
            # equal to the number of distinct words seen before it.
            vocabulary.setdefault(word, len(vocabulary))
    return vocabulary

# Build the vocabulary from all processed summaries and report its size.
vocabulary = build_vocabulary(df['processed_summary'])
vocab_size = len(vocabulary)
print(f"Vocabulary Size: {vocab_size}")

# Convert a token list to numeric form using bag of words
def text_to_bow(text, vocabulary):
    """Build the count vector of `text` over `vocabulary`.

    Returns a float numpy array with one slot per vocabulary entry; each slot
    holds how often that word occurs in `text`. Words that are not in the
    vocabulary contribute nothing.
    """
    counts = np.zeros(len(vocabulary))
    for token in text:
        slot = vocabulary.get(token)
        if slot is None:
            continue
        counts[slot] += 1
    return counts

# Convert every processed summary into its bag-of-words count vector.
df['bow_vector'] = df['processed_summary'].apply(lambda x: text_to_bow(x, vocabulary))

# Stack the per-row vectors into a 2-D feature matrix; labels become an array.
X = np.array(df['bow_vector'].tolist())
y = np.array(df['is_factual'].values)

# Warn if any NaN is already present before scaling.
if np.any(np.isnan(X)) or np.any(np.isnan(y)):
    print("There are NaN values in the dataset after BoW conversion.")

# Standardise each feature column to zero mean and unit variance.
# NOTE(review): the statistics are computed over all rows, before the
# train/test split that follows, so test rows influence the scaling.
feature_mean = np.mean(X, axis=0)
feature_std = np.std(X, axis=0)
# A column with the same value in every row has std 0, and dividing by it
# would give 0/0 = NaN. Dividing by 1 instead leaves such a column at 0.
feature_std = np.where(feature_std == 0, 1.0, feature_std)
X = (X - feature_mean) / feature_std

# Sanity check: normalisation should no longer be able to introduce NaNs.
if np.any(np.isnan(X)):
    print("Normalization led to NaN values in the features.")

# Split once more, 80-20 by default
def train_test_split_manual(X, y, test_size=0.2):
    """Cut X and y into a leading training part and a trailing test part.

    The cut index is ``int(X.shape[0] * (1 - test_size))``; rows are not
    shuffled. Returns ``(X_train, X_test, y_train, y_test)``.
    """
    n_rows = X.shape[0]
    boundary = int(n_rows * (1 - test_size))
    train_part = (X[:boundary], y[:boundary])
    test_part = (X[boundary:], y[boundary:])
    return train_part[0], test_part[0], train_part[1], test_part[1]

# Split the standardised features and labels with the default test_size of 0.2
# (leading rows train, trailing rows test).
X_train, X_test, y_train, y_test = train_test_split_manual(X, y)

# Logistic regression trained with batch gradient descent
class LogisticRegression:
    """Binary logistic-regression classifier fitted by full-batch gradient descent.

    Attributes:
        learning_rate: Step size applied to every gradient update.
        n_iters: Number of gradient-descent iterations run by ``fit``.
        weights: Per-feature weight vector (``None`` until ``fit`` is called).
        bias: Scalar intercept (``None`` until ``fit`` is called).
    """

    def __init__(self, learning_rate=0.0001, n_iters=1000):
        self.learning_rate, self.n_iters = learning_rate, n_iters
        self.weights = self.bias = None

    def fit(self, X, y):
        """Fit weights and bias to ``X`` (samples x features) and labels ``y``.

        Weights start as small random normal draws (scaled by 0.01) and the
        bias starts at 0. Every 100th iteration prints the first ten predicted
        probabilities and the binary cross-entropy of that iteration's
        predictions, for debugging.
        """
        sample_count, feature_count = X.shape
        # Small random starting weights
        self.weights = 0.01 * np.random.randn(feature_count)
        self.bias = 0
        scale = 1 / sample_count

        for step in range(self.n_iters):
            report = step % 100 == 0

            # Forward pass: linear score squashed through the sigmoid
            y_predicted = sigmoid(np.dot(X, self.weights) + self.bias)
            if report:
                print(f'Iteration {step}: Predictions = {y_predicted[:10]}')

            # Gradients of the mean cross-entropy w.r.t. weights and bias
            residual = y_predicted - y
            grad_w = scale * np.dot(X.T, residual)
            grad_b = scale * np.sum(residual)

            # Gradient-descent step
            self.weights -= self.learning_rate * grad_w
            self.bias -= self.learning_rate * grad_b

            # The loss is reported for the predictions made before this step
            if report:
                loss = binary_cross_entropy(y, y_predicted)
                print(f'Iteration {step}: Loss = {loss}')

    def predict(self, X):
        """Return a list of 0/1 labels; 1 where the predicted probability exceeds 0.5."""
        probabilities = sigmoid(np.dot(X, self.weights) + self.bias)
        return [int(p > 0.5) for p in probabilities]

# Sigmoid (logistic) function
def sigmoid(z):
    """Compute the logistic function 1 / (1 + exp(-z)) without overflow.

    Args:
        z: Scalar or array-like of real values.

    Returns:
        Values in [0, 1], with the same shape as ``z``.
    """
    # 1 / (1 + exp(-z)) == exp(-log(1 + exp(-z))) == exp(-logaddexp(0, -z)).
    # The direct form overflows in exp(-z) for large negative z; logaddexp
    # evaluates log(1 + exp(-z)) stably, so no overflow warning is raised.
    return np.exp(-np.logaddexp(0, -z))

# Loss function: binary cross-entropy
def binary_cross_entropy(y_true, y_pred):
    """Mean binary cross-entropy between labels and predicted probabilities.

    Predictions are clipped to [1e-10, 1 - 1e-10] first so that log(0) is
    never evaluated.
    """
    eps = 1e-10
    clipped = np.clip(y_pred, eps, 1 - eps)
    positive_term = y_true * np.log(clipped)
    negative_term = (1 - y_true) * np.log(1 - clipped)
    return -np.mean(positive_term + negative_term)

# Initialise the model and fit it on the training split.
model = LogisticRegression(learning_rate=0.0001, n_iters=1000)  # Use the reduced learning rate
model.fit(X_train, y_train)

# Predict 0/1 labels for the test split.
y_pred = model.predict(X_test)

# Compare the first ten predictions with the first ten true labels.
print(f"Predictions: {y_pred[:10]}")
print(f"Actual: {y_test[:10]}")

In [None]:
# Inspect the class balance of the training labels.
class_counts = pd.Series(y_train).value_counts()
print("Class distribution in the training set:")
print(class_counts)

# Per-class sample counts; .get avoids a KeyError when a class is absent from
# the training split.
num_zeros = int(class_counts.get(0, 0))
num_ones = int(class_counts.get(1, 0))

if num_zeros != num_ones:
    # Oversample whichever class is smaller, not only class 1.
    minority_label = 1 if num_ones < num_zeros else 0

    # Number of extra minority rows needed to match the majority class.
    n_samples_to_add = abs(num_zeros - num_ones)

    # Row positions of the minority class inside the training split.
    minority_indices = np.where(y_train == minority_label)[0]
    if minority_indices.size == 0:
        # Nothing to draw from: fail with a clear message rather than an
        # opaque error from np.random.choice.
        raise ValueError(
            f"Class {minority_label} has no samples in y_train; cannot oversample."
        )

    # Draw the extra rows with replacement; the fixed seed keeps it reproducible.
    np.random.seed(42)
    selected_indices = np.random.choice(minority_indices, n_samples_to_add, replace=True)

    # Append the duplicated minority rows to the training data.
    X_train_oversampled = np.vstack([X_train, X_train[selected_indices]])
    y_train_oversampled = np.hstack([y_train, y_train[selected_indices]])
else:
    # Classes are already balanced; use the training split unchanged.
    X_train_oversampled = X_train
    y_train_oversampled = y_train

# Check the class distribution after oversampling.
new_class_counts = pd.Series(y_train_oversampled).value_counts()
print("New class distribution after oversampling:")
print(new_class_counts)

# Retrain on the balanced training data (with a larger learning rate than the
# earlier run).
model = LogisticRegression(learning_rate=0.01, n_iters=1000)
model.fit(X_train_oversampled, y_train_oversampled)

# Predict on the untouched test split.
y_pred = model.predict(X_test)


In [None]:
# Load your dataset
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
file_path = 'C:\\Users\\DELL\\Downloads\\factuality_annotations_xsum_summaries.csv'
df = pd.read_csv(file_path)

# Step 1: Replace non-numeric values in 'is_factual' with numeric values
df['is_factual'] = df['is_factual'].replace({'yes': 1, 'no': 0})

# Step 2: Check for NaN values and drop them if necessary
df.dropna(subset=['is_factual'], inplace=True)

# Step 3: Convert the 'is_factual' column to integers (0 and 1)
df['is_factual'] = df['is_factual'].astype(int)

# Step 4: Check the class distribution
print("Original class distribution:")
print(df['is_factual'].value_counts())

# Step 5: Oversample the minority class
# value_counts() orders its result by frequency, not by label, so the counts
# must be looked up by label. Unpacking positionally would always put the
# larger count first, whichever class it belongs to.
label_counts = df['is_factual'].value_counts()
count_class_0 = int(label_counts.get(0, 0))
count_class_1 = int(label_counts.get(1, 0))

# Oversample whichever class is smaller, provided it has at least one row to
# sample from.
if count_class_0 != count_class_1 and min(count_class_0, count_class_1) > 0:
    minority_label = 1 if count_class_1 < count_class_0 else 0
    majority_label = 1 - minority_label

    df_minority = df[df['is_factual'] == minority_label]
    df_majority = df[df['is_factual'] == majority_label]

    # Draw minority rows with replacement until they match the majority size
    df_minority_oversampled = df_minority.sample(len(df_majority), replace=True, random_state=42)

    # Combine majority class with oversampled minority class
    df_balanced = pd.concat([df_majority, df_minority_oversampled], axis=0)
else:
    df_balanced = df

# Step 6: Check the new class distribution
print("New class distribution after oversampling:")
print(df_balanced['is_factual'].value_counts())

# Proceed with your model training using df_balanced
# Example: Split your data into features (X) and target (y)
X = df_balanced['summary']  # or any other feature you want to use
y = df_balanced['is_factual']

In [None]:
import pandas as pd

# Load your dataset
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
file_path = 'C:\\Users\\DELL\\Downloads\\factuality_annotations_xsum_summaries.csv'
df = pd.read_csv(file_path)

# Replace 'yes' with 1 and 'no' with 0 in the 'is_factual' column
df['is_factual'] = df['is_factual'].replace({'yes': 1, 'no': 0})

# Drop rows with NaN values
# NOTE(review): no `subset` is given, so a NaN in ANY column removes the row;
# confirm that this is intended rather than restricting to 'is_factual'.
df.dropna(inplace=True)

# Check class distribution
print("Original class distribution:")
print(df['is_factual'].value_counts())

# Oversample the minority class
from sklearn.utils import resample

# Separate the majority and minority classes
# NOTE(review): this hard-codes class 0 as the majority and class 1 as the
# minority; verify against the distribution printed above.
majority = df[df['is_factual'] == 0]
minority = df[df['is_factual'] == 1]

# Upsample minority class
minority_upsampled = resample(minority, 
                               replace=True,     # sample with replacement
                               n_samples=len(majority),    # to match majority class
                               random_state=42) # reproducible results

# Combine majority class with upsampled minority class
df_balanced = pd.concat([majority, minority_upsampled])

# Check new class distribution
print("New class distribution after oversampling:")
print(df_balanced['is_factual'].value_counts())

In [None]:
import re
from nltk.corpus import stopwords

# Ensure you have the necessary NLTK resources downloaded
import nltk
nltk.download('stopwords')
# word_tokenize (used in the tokenization cell below) needs the Punkt sentence
# tokenizer data as well; without it the call raises LookupError on a fresh
# environment. Newer NLTK releases look for 'punkt_tab', older ones for
# 'punkt', so both are requested.
nltk.download('punkt')
nltk.download('punkt_tab')

# Function to clean and preprocess text
def preprocess_text(text):
    """Normalise raw text to lowercase letters and whitespace.

    Args:
        text: The string to clean. Any non-string value (for example the float
            NaN that pandas uses for a missing cell, or None) is treated as
            empty text.

    Returns:
        str: ``text`` with every character other than ASCII letters and
        whitespace removed, lowercased, and stripped of leading/trailing
        whitespace. Non-string input gives ``''``.
    """
    # re.sub raises TypeError on NaN/None; return empty text instead so one
    # missing cell does not abort the whole .apply().
    if not isinstance(text, str):
        return ''
    # Remove punctuation and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text.lower().strip()

# Clean the text column ('summary' here; replace with the actual column name
# if it differs) and store the result in 'cleaned_text'.
df_balanced['cleaned_text'] = df_balanced['summary'].apply(preprocess_text)

# Display the first few rows, raw text next to cleaned text
print(df_balanced[['summary', 'cleaned_text']].head())


In [None]:
from nltk.tokenize import word_tokenize

# Tokenization function
def tokenize_text(text):
    """Return the tokens produced by NLTK's ``word_tokenize`` for ``text``."""
    return word_tokenize(text)

# Apply tokenization to the cleaned text; tokens are stored per row in 'tokens'.
df_balanced['tokens'] = df_balanced['cleaned_text'].apply(tokenize_text)

# Display the first few rows, cleaned text next to its tokens
print(df_balanced[['cleaned_text', 'tokens']].head())

In [None]:
# English stopword list from NLTK, held as a set for membership tests
stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens):
    """Return the tokens that are not in `stop_words`, in their original order."""
    kept = []
    for word in tokens:
        if word not in stop_words:
            kept.append(word)
    return kept

# Apply stopword removal to each token list.
# NOTE(review): 'tokens_no_stopwords' is not read by any later cell in this
# notebook; the features below are built from 'cleaned_text'.
df_balanced['tokens_no_stopwords'] = df_balanced['tokens'].apply(remove_stopwords)

# Display the first few rows, before and after stopword removal
print(df_balanced[['tokens', 'tokens_no_stopwords']].head())

In [None]:
# Set a random seed for reproducibility
# NOTE(review): sample() below is called without random_state, so it
# presumably draws from this global NumPy seed; confirm.
np.random.seed(42)

# Shuffle the dataset and renumber the rows 0..n-1
df_balanced = df_balanced.sample(frac=1).reset_index(drop=True)

# Define the split ratio (fraction of rows used for training)
split_ratio = 0.8
split_index = int(split_ratio * len(df_balanced))

# Split into train and test sets: leading rows train, remaining rows test.
# NOTE(review): df_balanced was oversampled with replacement before this
# split, so copies of the same row can land in both train and test; the test
# scores may therefore be optimistic.
train_data = df_balanced[:split_index]
test_data = df_balanced[split_index:]

# Extract features (cleaned text) and labels
X_train = train_data['cleaned_text']
y_train = train_data['is_factual']
X_test = test_data['cleaned_text']
y_test = test_data['is_factual']


In [None]:
def bag_of_words(texts):
    """Build a word -> column-index vocabulary from an iterable of strings.

    Args:
        texts: Iterable of text strings.

    Returns:
        dict: Maps every distinct lowercase ``\\w+`` token found in ``texts``
        to an index in ``0..len(vocabulary) - 1``. Indices follow the sorted
        order of the words, so the mapping is the same on every run.
    """
    words = set()
    for text in texts:
        words.update(re.findall(r'\w+', text.lower()))
    # Set iteration order for strings varies between interpreter sessions
    # (hash randomisation); sorting makes the column layout reproducible.
    word_to_index = {word: i for i, word in enumerate(sorted(words))}
    return word_to_index

# Creating features using Bag of Words: the vocabulary is built from the
# training texts only.
word_to_index = bag_of_words(X_train)
# One row per training text, one column per vocabulary word; each entry counts
# occurrences of that word among the text's whitespace-separated tokens.
X_train_bow = np.zeros((len(X_train), len(word_to_index)))
for i, text in enumerate(X_train):
    for word in text.split():
        if word in word_to_index:
            X_train_bow[i, word_to_index[word]] += 1

# Add bias term: a leading column of ones, so the first weight acts as the intercept
X_train_bow = np.hstack((np.ones((X_train_bow.shape[0], 1)), X_train_bow))

# Sigmoid function
def sigmoid(z):
    """Compute the logistic function 1 / (1 + exp(-z)) without overflow.

    Args:
        z: Scalar or array-like of real values.

    Returns:
        Values in [0, 1], with the same shape as ``z``.
    """
    # Algebraically equal to 1 / (1 + exp(-z)). logaddexp(0, -z) evaluates
    # log(1 + exp(-z)) stably, so large negative z no longer overflows exp().
    return np.exp(-np.logaddexp(0, -z))

# Logistic regression training
def train_logistic_regression(X, y, learning_rate=0.01, num_iterations=1000):
    """Fit logistic-regression weights with full-batch gradient descent.

    Args:
        X: 2-D feature matrix (samples x features). No intercept is added by
            this function.
        y: Labels, one per row of ``X``.
        learning_rate: Step size of each update.
        num_iterations: Number of gradient steps.

    Returns:
        numpy.ndarray: Weight vector with one entry per column of ``X``;
        training starts from all zeros.
    """
    n_rows, n_cols = X.shape
    weights = np.zeros(n_cols)

    for _step in range(num_iterations):
        # Residual between predicted probabilities and the labels
        residual = sigmoid(np.dot(X, weights)) - y
        # Average gradient over the batch, then one descent step
        weights -= learning_rate * (np.dot(X.T, residual) / n_rows)

    return weights

# Train the model with the default learning rate and iteration count
weights = train_logistic_regression(X_train_bow, y_train)


# Create features for test data using the vocabulary built from the training
# texts; words that are not in that vocabulary are ignored.
X_test_bow = np.zeros((len(X_test), len(word_to_index)))
for i, text in enumerate(X_test):
    for word in text.split():
        if word in word_to_index:
            X_test_bow[i, word_to_index[word]] += 1

# Add bias term: a leading column of ones, matching the training matrix layout
X_test_bow = np.hstack((np.ones((X_test_bow.shape[0], 1)), X_test_bow))

# Predict function
def predict(X, weights):
    """Return predicted labels: the sigmoid of ``X . weights`` rounded with np.round."""
    scores = np.dot(X, weights)
    probabilities = sigmoid(scores)
    return np.round(probabilities)

# Make predictions on the test feature matrix
y_pred = predict(X_test_bow, weights)

# Calculate accuracy: the fraction of test rows whose prediction equals the label
accuracy = np.mean(y_pred == y_test)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Confusion matrix
def confusion_matrix(y_true, y_pred):
    """Return the 2x2 confusion matrix ``[[tn, fp], [fn, tp]]`` for 0/1 labels.

    Rows correspond to the true label (0 then 1), columns to the predicted
    label (0 then 1).
    """
    def count(true_label, predicted_label):
        # Number of positions where both the true and predicted label match
        return np.sum((y_true == true_label) & (y_pred == predicted_label))

    return np.array([[count(0, 0), count(0, 1)],
                     [count(1, 0), count(1, 1)]])

# Confusion matrix on the test split, laid out as [[tn, fp], [fn, tp]]
cm = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(cm)

In [None]:
def grid_search_svm(X_train, y_train, C_values, kernel_values, X_val=None, y_val=None):
    """Exhaustively search ``C`` x ``kernel`` combinations for an SVM.

    NOTE(review): ``SVM`` is not defined in the visible part of this notebook;
    it is assumed to take ``C`` and ``kernel`` keyword arguments and to provide
    ``fit(X, y)`` and ``predict(X)``. Confirm against its definition.

    Args:
        X_train: Training features.
        y_train: Training labels.
        C_values: Iterable of candidate ``C`` values.
        kernel_values: Iterable of candidate kernel identifiers.
        X_val: Optional held-out features used to score each candidate.
        y_val: Optional held-out labels matching ``X_val``.

    Returns:
        dict: ``{'C': ..., 'kernel': ...}`` for the best-scoring combination,
        or ``{}`` when the grid is empty.

    When ``X_val`` and ``y_val`` are both supplied, candidates are scored on
    them. Otherwise scoring falls back to the training set, which favours
    over-fitted settings and should only be used as a smoke test.
    """
    use_validation = X_val is not None and y_val is not None
    X_eval, y_eval = (X_val, y_val) if use_validation else (X_train, y_train)

    best_accuracy = 0
    best_params = {}

    for C in C_values:
        for kernel in kernel_values:
            # Train SVM model
            svm_model = SVM(C=C, kernel=kernel)
            svm_model.fit(X_train, y_train)

            # Score on the held-out set if provided, else on the training set
            y_pred = svm_model.predict(X_eval)

            # Calculate accuracy
            accuracy = np.mean(y_pred == y_eval)

            print(f'Kernel: {kernel}, C: {C}, Accuracy: {accuracy:.2f}')

            # `not best_params` makes the first candidate count as best even
            # when every accuracy is 0, so a non-empty grid always yields
            # usable parameters.
            if not best_params or accuracy > best_accuracy:
                best_accuracy = accuracy
                best_params = {'C': C, 'kernel': kernel}

    print(f'Best parameters: {best_params}, Best accuracy: {best_accuracy:.2f}')
    return best_params



def evaluate_model(y_true, y_pred):
    """Print accuracy, precision, recall and F1 for binary 0/1 labels.

    Class 1 is treated as the positive class. Precision, recall and F1 are
    reported as 0 whenever their denominator is zero. Nothing is returned.
    """
    accuracy = np.mean(y_true == y_pred)

    # Boolean masks for each side of the confusion matrix
    actual_pos, actual_neg = (y_true == 1), (y_true == 0)
    pred_pos, pred_neg = (y_pred == 1), (y_pred == 0)

    tp = np.sum(actual_pos & pred_pos)
    tn = np.sum(actual_neg & pred_neg)
    fp = np.sum(actual_neg & pred_pos)
    fn = np.sum(actual_pos & pred_neg)

    if tp + fp > 0:
        precision = tp / (tp + fp)
    else:
        precision = 0

    if tp + fn > 0:
        recall = tp / (tp + fn)
    else:
        recall = 0

    if precision + recall > 0:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0

    report = (
        ('Accuracy', accuracy),
        ('Precision', precision),
        ('Recall', recall),
        ('F1 Score', f1_score),
    )
    for label, value in report:
        print(f'{label}: {value:.2%}')

# Evaluate the model with predictions (prints accuracy, precision, recall, F1)
evaluate_model(y_test, y_pred)


# Ensure y_true and y_pred are numpy arrays so they can be indexed by position
y_true = np.array(y_test)
y_pred = np.array(y_pred)

def error_analysis(y_true, y_pred, original_indices, df):
    """Print up to five misclassified examples together with their text.

    Args:
        y_true: Array of true labels.
        y_pred: Array of predicted labels, compared element-wise with ``y_true``.
        original_indices: Sequence mapping a position in ``y_true``/``y_pred``
            to a row position in ``df``.
        df: DataFrame with a 'cleaned_text' column, read positionally (.iloc).

    Prints the total number of misclassifications, then one report per shown
    example. An out-of-bounds message is printed instead whenever a lookup
    would fall outside the corresponding container. Returns None.
    """
    # Positions where prediction and truth disagree
    mismatches = np.where(y_true != y_pred)[0]

    print(f"Total Misclassifications: {len(mismatches)}")
    if len(mismatches) == 0:
        print("No misclassifications found.")
        return

    print("Misclassified examples:")
    # Only the first 5 misclassifications are shown
    for position in mismatches[:5]:
        print(f"Error index: {position}")

        # Guard: the position must be valid for both label arrays
        if position >= len(y_true) or position >= len(y_pred):
            print(f"Error index {position} is out of bounds for true labels or predicted labels.")
            continue

        true_label = y_true[position]
        predicted_label = y_pred[position]

        # Guard: the position must have an entry in original_indices
        if position >= len(original_indices):
            print(f"Error index {position} is out of bounds for original_indices.")
            continue

        original_index = original_indices[position]

        # Guard: the mapped row must exist in df
        if original_index >= len(df):
            print(f"Original index {original_index} out of bounds for cleaned_text.")
            continue

        text = df['cleaned_text'].iloc[original_index]
        print(f'True: {true_label}, Predicted: {predicted_label}, Text: {text}')

# Call the error analysis function with the relevant arguments.
# `original_indices` was never defined in the notebook, so the call raised
# NameError. test_data is the tail slice of df_balanced taken after
# reset_index(drop=True), so its index labels are the row positions in
# df_balanced, which is what error_analysis looks up with .iloc.
original_indices = test_data.index.to_numpy()
error_analysis(y_true, y_pred, original_indices, df_balanced)
