<a href="https://colab.research.google.com/github/Rituchoudhary67/12345/blob/main/Untitled.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# Read the raw NSEI data from the CSV file
data = pd.read_csv("NSEI data set.csv")

# Debug aid: show which column names were actually parsed
print("Columns in dataset:", data.columns.tolist())

# Keep 'Date' aside when present; otherwise report it and fall back to None
D = data['Date'] if 'Date' in data.columns else None
if D is None:
    print("Column 'Date' not found in dataset.")

# Same treatment for 'Volume'
V = data['Volume'] if 'Volume' in data.columns else None
if V is None:
    print("Column 'Volume' not found in dataset.")

# Create a DataFrame from the data
df = pd.DataFrame(data)

# Step 1: Preprocess the data
# Drop unwanted columns, but only the ones that are actually present,
# so a CSV lacking some of them does not raise.
drop_columns = ['Date', 'Volume', 'TP', 'Rule1', 'Rule2', 'Rule3']
existing_drop_columns = [col for col in drop_columns if col in df.columns]
df = df.drop(columns=existing_drop_columns)

# Convert price columns to float after stripping thousands separators
# (values such as "17,500.25" cannot be cast to float directly).
numeric_columns = ['Open', 'High', 'Low', 'Close']
for col in numeric_columns:
    if col in df.columns:
        df[col] = df[col].replace({",": ""}, regex=True).astype(float)
    else:
        print(f"Column '{col}' not found in dataset.")

# Handle missing values by forward filling.
# DataFrame.ffill() replaces fillna(method="ffill"): the `method` argument is
# deprecated in pandas and triggers a FutureWarning on every run.
# Reassigning instead of inplace=True keeps this step re-runnable.
df = df.ffill()

# Ensure the target variable 'Classifier' exists and is of type int
if 'Classifier' not in df.columns:
    raise KeyError("Column 'Classifier' not found in dataset.")
df['Classifier'] = df['Classifier'].astype(int)

# Step 2: Separate the label column from the feature matrix
y = df['Classifier']                 # Target variable
X = df.drop(columns=['Classifier'])  # Features

# Hold out 20% of the rows for testing; stratify so both parts keep the class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42, test_size=0.2
)

# Steps 3 and 4: Build each classifier and fit it right away
# (fit() returns the estimator itself, so the names still refer to the fitted models)
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
decision_tree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
logreg = LogisticRegression(random_state=42, solver='liblinear').fit(X_train, y_train)

# Step 5: Predict the held-out rows with every model
y_pred_knn = knn.predict(X_test)
y_pred_dt = decision_tree.predict(X_test)
y_pred_logreg = logreg.predict(X_test)

# Report the accuracy of each model on the test set
for label, predicted in (
    ("Accuracy by KNN  :", y_pred_knn),
    ("Accuracy by DT   :", y_pred_dt),
    ("Accuracy by LogReg:", y_pred_logreg),
):
    print(label, accuracy_score(y_test, predicted))

# Step 6: Build a copy of the test features with one prediction column per model
df_test = X_test.assign(
    KNN_Pred=y_pred_knn,
    DT_Pred=y_pred_dt,
    LogReg_Pred=y_pred_logreg,
)

# Re-attach the original 'Date' values by row index when the column was available
if D is not None:
    df_test = pd.merge(df_test, data[['Date']], left_index=True, right_index=True)

# To also carry the original 'Volume' column, uncomment the following:
# if V is not None:
#     df_test = pd.merge(df_test, data[['Volume']], left_index=True, right_index=True)

# Write the test rows plus predictions to disk (row index is not written)
df_test.to_csv("NSEI_3C_Predictions.csv", index=False)


Columns in dataset: ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Rule1', 'Rule2', 'TP', 'Rule3', 'Classifier']
Accuracy by KNN  : 0.7250293772032902
Accuracy by DT   : 0.6850763807285546
Accuracy by LogReg: 0.782608695652174


  df.fillna(method="ffill", inplace=True)


In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# Read the CSV and wrap it in a DataFrame
data = pd.read_csv("NSEI data set.csv")
df = pd.DataFrame(data)

# Report per-column missing counts before any cleaning
print("Missing values in the dataset:\n", df.isnull().sum())

# Remove the columns that are not used as features (only those present)
drop_columns = ['Date', 'Volume', 'TP', 'Rule1', 'Rule2', 'Rule3']
df = df.drop(columns=[name for name in drop_columns if name in df.columns])

# Strip thousands separators and cast the price columns to float;
# columns missing from the file are skipped silently
numeric_columns = ['Open', 'High', 'Low', 'Close', 'Adj Close']
for col in numeric_columns:
    if col not in df.columns:
        continue
    df[col] = df[col].replace({",": ""}, regex=True).astype(float)

# Propagate the last valid observation forward over any gaps
df.ffill(inplace=True)

# Report per-column missing counts again after cleaning
print("Missing values after preprocessing:\n", df.isnull().sum())

# The target column is mandatory
if 'Classifier' not in df.columns:
    raise KeyError("Column 'Classifier' not found in dataset.")

# Target (cast to int) and feature matrix
y = df['Classifier'].astype(int)
X = df.drop(columns=['Classifier'])

# Estimators to compare, keyed by the display name used in the printed reports
models = {
    'KNN': KNeighborsClassifier(n_neighbors=3),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42, solver='liblinear'),
}

# Function to train and test models
def train_and_test(X, y, test_size):
    """Fit and score every estimator in the module-level ``models`` dict on one split.

    The data is divided once with ``train_test_split`` (stratified on ``y``,
    ``random_state=42``). Each estimator is then fit on the training part,
    used to predict the held-out part, and its metrics are printed.

    Parameters
    ----------
    X
        Feature table, passed straight to ``train_test_split``.
    y
        Target labels; also used as the stratification key.
    test_size
        Forwarded to ``train_test_split`` as ``test_size``. The printed header
        treats it as a fraction and shows it as a whole-number percentage.

    Returns
    -------
    dict
        Maps each model name to a dict with the keys 'Accuracy', 'Precision',
        'Recall', 'F1 Score' and 'Confusion Matrix'. Precision, recall and F1
        are macro-averaged with ``zero_division=0``.

    Notes
    -----
    The estimators stored in ``models`` are fit in place, so after the call
    they hold the fit from this split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y
    )

    # round(), not int(): a product such as 0.29 * 100 evaluates to
    # 28.999999999999996 and int() would truncate it to 28.
    split_pct = round(test_size * 100)

    results = {}
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Compute evaluation metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
        recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
        f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)
        confusion = confusion_matrix(y_test, y_pred)

        # Store results
        results[model_name] = {
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1 Score': f1,
            'Confusion Matrix': confusion
        }

        print(f"\n{model_name} Model Results (Test Split {split_pct}%):")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")
        print(f"Confusion Matrix:\n{confusion}")
        print(f"Classification Report:\n{classification_report(y_test, y_pred)}")

    return results

# Run tests for multiple split ratios.
# Keep the metrics returned for each ratio instead of discarding them, so the
# splits can be compared programmatically and not only by reading the printout.
split_ratios = [0.40, 0.30, 0.20]
results_by_split = {}
for test_size in split_ratios:
    results_by_split[test_size] = train_and_test(X, y, test_size)

Missing values in the dataset:
 Date          0
Open          0
High          0
Low           0
Close         0
Volume        0
Rule1         0
Rule2         0
TP            0
Rule3         0
Classifier    0
dtype: int64
Missing values after preprocessing:
 Open          0
High          0
Low           0
Close         0
Classifier    0
dtype: int64

KNN Model Results (Test Split 40%):
Accuracy: 0.6868
Precision: 0.6840
Recall: 0.6821
F1 Score: 0.6827
Confusion Matrix:
[[487 286]
 [247 682]]
Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.63      0.65       773
           1       0.70      0.73      0.72       929

    accuracy                           0.69      1702
   macro avg       0.68      0.68      0.68      1702
weighted avg       0.69      0.69      0.69      1702


Decision Tree Model Results (Test Split 40%):
Accuracy: 0.6563
Precision: 0.6537
Recall: 0.6541
F1 Score: 0.6538
Confusion Matrix:
[[487 286]
 [299 630]]