In [7]:
import pandas as pd
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import classification_report, accuracy_score


df = pd.read_csv('test.csv', encoding='latin1') #UTF-8 isn't working 
#-------------------------------------------------------------------------
#Alternative 
#encodings = ['utf-8', 'ISO-8859-1', 'latin1', 'cp1252']
#for enc in encodings:
# try:
#  data = pd.read_csv('test.csv', encoding=enc)
#  break
#except UnicodeDecodeError as e:
    #    print("Failed to load")
#--------------------------------------------------------------------------

X = df['description']
y = df['test_cases']
expected_outputs = df[['test_cases', 'expected_output']]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

parameters = {
    'tfidf__ngram_range': [(1, 1), (1, 2)], #tdidfparameter
    'clf__C': [0.1, 1, 10] #logisticparameter
}

# Create a pipeline with TF-IDF and Logistic Regression
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(max_iter=1000))
])
#--------------------------------------------------------------
#train
cv = KFold(n_splits=3, shuffle=True, random_state=42)

grid_search = GridSearchCV(pipeline, parameters, cv=cv, n_jobs=-1, verbose=1)

grid_search.fit(X_train, y_train)

print("Best parameters found:", grid_search.best_params_)

#----------------------------------------------------------------
#Test
y_pred = grid_search.predict(X_test)

print(classification_report(y_test, y_pred))

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy Score: {accuracy:.2f}")

joblib.dump(grid_search.best_estimator_, 'text_classification_model_testcases.pkl')

def predict_and_get_expected_output(description):
    predicted_test_case = grid_search.predict([description])[0]
    expected_output_row = expected_outputs[expected_outputs['test_cases'] == predicted_test_case]
    expected_output = expected_output_row['expected_output'].values[0] if not expected_output_row.empty else "Expected output not found"
    return predicted_test_case, expected_output

example_description = "I want to place an order so suggest"
predicted_test_case, expected_output = predict_and_get_expected_output(example_description)
print(f"Predicted Test Case for '{example_description}':\n{predicted_test_case}")
print(f"Expected Output for '{predicted_test_case}':\n{expected_output}")



Fitting 3 folds for each of 6 candidates, totalling 18 fits
Best parameters found: {'clf__C': 1, 'tfidf__ngram_range': (1, 1)}
                                                                                                            precision    recall  f1-score   support

             1. Access and handle personal data <br> 2. Verify compliance with data protection regulations       0.00      0.00      0.00         0
            1. Access and manage policies online <br> 2. Verify correct functionality and data consistency       0.00      0.00      0.00         1
               1. Access application from different time zones <br> 2. Verify time-related functionalities       0.00      0.00      0.00         0
                       1. Access application in different browsers <br> 2. Verify functionality and layout       0.00      0.00      0.00         1
  1. Access application on devices with various screen resolutions <br> 2. Verify layout and functionality       0.00      0.00     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fitting 3 folds for each of 6 candidates, totalling 18 fits
Predicted Response: Test Steps Expected Output
