In [19]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from joblib import dump
from joblib import load


In [22]:
# Read the unlabelled test sentences from a CSV file
def load_data_from_test(file_path):
    """Return the ``input`` column of a CSV file.

    Args:
        file_path: Path (or any source accepted by ``pandas.read_csv``) of a
            CSV file that contains an ``input`` column.

    Returns:
        The values of the ``input`` column, in file order.

    Raises:
        KeyError: If the file has no ``input`` column.
    """
    test_frame = pd.read_csv(file_path)
    return test_frame['input'].values


# Read labelled sentences from a CSV file
def load_data_from_csv(file_path):
    """Return the ``input`` and ``labels`` columns of a CSV file.

    Args:
        file_path: Path (or any source accepted by ``pandas.read_csv``) of a
            CSV file that contains ``input`` and ``labels`` columns.

    Returns:
        A ``(X, y)`` tuple: the values of the ``input`` column and the values
        of the ``labels`` column, both in file order.

    Raises:
        KeyError: If either column is missing from the file.
    """
    labelled_frame = pd.read_csv(file_path)
    return labelled_frame['input'].values, labelled_frame['labels'].values

# Restore a previously saved object from disk
def load_model(file_path):
    """Deserialize and return the object stored at ``file_path``.

    This is a thin wrapper around ``joblib.load``. For a file written by
    ``save_model`` the result is a dict with ``'model'`` and ``'vectorizer'``
    keys.

    NOTE: joblib files are pickle-based, so loading one can execute arbitrary
    code. Only load files from a trusted source.

    Args:
        file_path: Location of the joblib file to read.

    Returns:
        Whatever object was serialized into the file.
    """
    restored = load(file_path)
    return restored

# TF-IDF feature extraction
def extract_features(X_train, X_test):
    """Vectorize two text collections with a single TF-IDF vectorizer.

    A ``TfidfVectorizer`` with default settings is fitted on ``X_train`` only;
    ``X_test`` is then transformed with that already-fitted vectorizer, so no
    statistics are learned from ``X_test``.

    Args:
        X_train: Documents used to fit the vectorizer.
        X_test: Documents to transform with the fitted vectorizer.

    Returns:
        A ``(X_train_vectorized, X_test_vectorized, vectorizer)`` tuple.
    """
    tfidf = TfidfVectorizer()
    # Tuple elements are evaluated left to right, so fitting happens before
    # the second collection is transformed.
    return tfidf.fit_transform(X_train), tfidf.transform(X_test), tfidf

# Fit the classifier
def train_model(X_train_vectorized, y_train):
    """Fit a linear-kernel support vector classifier and return it.

    Args:
        X_train_vectorized: Feature matrix for the training documents.
        y_train: Target labels aligned with the rows of the feature matrix.

    Returns:
        The fitted ``SVC`` instance.
    """
    classifier = SVC(kernel='linear')
    classifier.fit(X_train_vectorized, y_train)
    return classifier

# Report classification quality on a labelled set
def evaluate_model(model, X_test_vectorized, y_test):
    """Print a classification report for ``model`` on the given data.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test_vectorized: Feature matrix to predict on.
        y_test: True labels aligned with the rows of the feature matrix.

    Returns:
        None. The report is written to standard output.
    """
    predictions = model.predict(X_test_vectorized)
    report = classification_report(y_test, predictions)
    print(report)

# Persist the classifier together with its vectorizer
def save_model(model, vectorizer, file_path):
    """Serialize ``model`` and ``vectorizer`` into one joblib file.

    The two objects are stored as a dict with the keys ``'model'`` and
    ``'vectorizer'``, so they can be restored together later.

    Args:
        model: Trained estimator to persist.
        vectorizer: Fitted vectorizer that produced the model's features.
        file_path: Destination of the joblib file.

    Returns:
        None.
    """
    bundle = {'model': model, 'vectorizer': vectorizer}
    dump(bundle, file_path)


In [25]:
# Main function
def main(
    train_data_file='/content/train_data.csv',
    val_data_file='/content/val_data.csv',
    test_data_file='/content/test_data.csv',
    output_file='predicted_output.csv',
    model_file='grammatical_error_detector.joblib',
):
    """Train, validate, predict and persist the text classifier end to end.

    Steps:
        1. Load the labelled training and validation CSV files and the
           unlabelled test CSV file.
        2. Fit TF-IDF features on the training sentences and transform the
           validation sentences with the same vectorizer.
        3. Train the classifier and print a validation classification report.
        4. Predict labels for the test sentences and write them to
           ``output_file`` with the columns ``Sentence`` and
           ``Predicted Label``.
        5. Save the model and vectorizer to ``model_file``.

    Every path is a keyword parameter whose default is the location the
    notebook originally hard-coded, so calling ``main()`` with no arguments
    behaves exactly as before.

    Args:
        train_data_file: CSV with ``input`` and ``labels`` columns used for
            training.
        val_data_file: CSV with ``input`` and ``labels`` columns used for
            validation.
        test_data_file: CSV with an ``input`` column to predict on.
        output_file: Destination CSV for the test-set predictions.
        model_file: Destination joblib file for the model and vectorizer.

    Returns:
        None. Results are printed and written to disk.
    """
    # Load the labelled training and validation splits
    X_train, y_train = load_data_from_csv(train_data_file)
    X_val, y_val = load_data_from_csv(val_data_file)

    # Load the unlabelled test sentences (a CSV file, like the other splits)
    X_test = load_data_from_test(test_data_file)

    # Feature extraction: the vectorizer is fitted on the training split only
    X_train_vectorized, X_val_vectorized, vectorizer = extract_features(X_train, X_val)

    # Train the model
    model = train_model(X_train_vectorized, y_train)

    # Evaluate the model on the validation set
    print("Validation Set Evaluation:")
    evaluate_model(model, X_val_vectorized, y_val)

    # Vectorize the test sentences with the already-fitted vectorizer, then
    # predict their labels
    X_test_vectorized = vectorizer.transform(X_test)
    y_pred = model.predict(X_test_vectorized)

    # Save each test sentence next to its predicted label
    output_df = pd.DataFrame({'Sentence': X_test, 'Predicted Label': y_pred})
    output_df.to_csv(output_file, index=False)
    print(f"Predicted output saved to {output_file}")

    # Save the trained model together with the vectorizer it depends on
    save_model(model, vectorizer, model_file)
    print(f"Trained model saved to {model_file}")



# Entry point: run the whole pipeline only when this code is executed directly
# (as a script or a notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Validation Set Evaluation:
              precision    recall  f1-score   support

           0       0.41      0.41      0.41      5000
           1       0.40      0.40      0.40      5000

    accuracy                           0.41     10000
   macro avg       0.40      0.41      0.40     10000
weighted avg       0.40      0.41      0.40     10000

Predicted output saved to predicted_output.csv
Trained model saved to grammatical_error_detector.joblib
