In [27]:
import pandas as pd

# Initialize a list to store data
data = []

# Read the file and process each line
with open('data.txt', 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if line:
            # Check if 'Answer: ' is in the line
            if ' Answer: ' in line:
                # Split the line into question and answer
                question, answer = line.split(' Answer: ', 1)
                data.append({'question': question, 'answer': answer})
            else:
                print(f"Line skipped (delimiter missing): {line}")

# Convert to DataFrame
df = pd.DataFrame(data)

# Display the DataFrame
print(df.head())

# Save to CSV if needed
df.to_csv('questions_and_answers.csv', index=False)


Line skipped (delimiter missing): Question: Who is the Leader of the Opposition in the Lok Sabha (Lower House of Parliament) in India?
                                        question                answer
0       Question: What is the capital of France?                Paris.
1        Question: Who wrote “Romeo and Juliet”?  William Shakespeare.
2  Question: What is the largest ocean on Earth?        Pacific Ocean.
3  Question: What is the tallest mammal on land?              Giraffe.
4           Question: Who painted the Mona Lisa?    Leonardo da Vinci.


In [21]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
import joblib

# Load your data
df = pd.read_csv('questions_and_answers.csv')

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['question'], df['answer'], test_size=0.2, random_state=42)

# Create a pipeline with TF-IDF and Naive Bayes
model = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', MultinomialNB())
])

# Define parameter grid for GridSearchCV
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],  # Unigrams or bigrams
    'clf__alpha': [0.1, 1, 10]  # Smoothing parameter for Naive Bayes
}

# Use StratifiedKFold for cross-validation
cv = StratifiedKFold(n_splits=3)  # or a suitable number

# Create GridSearchCV object
grid_search = GridSearchCV(model, parameters, cv=cv, n_jobs=-1, verbose=1)

# Train the model with GridSearchCV
grid_search.fit(X_train, y_train)

# Print best parameters
print("Best parameters found:", grid_search.best_params_)

# Evaluate the model on the test set
y_pred = grid_search.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))

print(f"Accuracy Score: {accuracy_score(y_test, y_pred):.2f}")

# Save the best model
joblib.dump(grid_search.best_estimator_, 'qa_retrieval_model.pkl')

print("Model saved as 'qa_retrieval_model.pkl'")


Fitting 3 folds for each of 6 candidates, totalling 18 fits


ValueError: n_splits=3 cannot be greater than the number of members in each class.

In [16]:

import joblib

# Load the trained model
model = joblib.load('qa_retrieval_model.pkl')

# Function to ask a question and get an answer
def ask_question(question):
    # Predict the answer
    predicted_answer = model.predict([question])[0]
    return predicted_answer

# Example question
input_question = "What is java?"

# Get the answer
answer = ask_question(input_question)

print(f"Question: {input_question}")
print(f"Answer: {answer}")



Question: What is java?
Answer: An abstract class is a class that cannot be instantiated and may contain abstract methods (methods without implementations) that must be implemented by subclasses.


In [17]:
from sklearn.metrics import accuracy_score

# Accuracy requires binary or multiclass classification, not for text. For text classification, you might use exact match accuracy.
correct_predictions = [pred == ans for pred, ans in zip(answer, "Pariss" )]
accuracy = sum(correct_predictions) / len(correct_predictions)

print(f"Accuracy: {accuracy:.2f}")


Accuracy: 0.17


In [22]:
import pandas as pd

# Load data from 'data.txt'
data_file_path = 'data.txt'
data = pd.read_csv(data_file_path, delimiter='\t', header=None, names=['question', 'answer'])

# Save the DataFrame to a CSV file
csv_file_path = 'data.csv'
data.to_csv(csv_file_path, index=False)

print(f"Data successfully converted to {csv_file_path}")


Data successfully converted to data.csv


In [23]:
import pandas as pd

# Load data from 'data.csv'
csv_file_path = 'data.csv'
data = pd.read_csv(csv_file_path)

# Check the first few rows of the data
print(data.head())


                                            question  answer
0  Question: What is Java? Answer: Java is a high...     NaN
1  Question: What is the primary use of the Sprin...     NaN
2  Question: What is a Java class? Answer: A Java...     NaN
3  Question: What is dependency injection in Spri...     NaN
4  Question: What is the difference between == an...     NaN


In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, ShuffleSplit
from sklearn.metrics import classification_report, accuracy_score
import joblib

# Prepare the data
X = df['question']
y = df['answer']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline with TF-IDF and Naive Bayes
model = Pipeline([
    ('vect', TfidfVectorizer()),
    ('clf', MultinomialNB())
])

# Define parameter grid for GridSearchCV
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],  # Unigrams or bigrams
    'clf__alpha': [0.1, 1, 10]  # Smoothing parameter for Naive Bayes
}

# Use ShuffleSplit for cross-validation
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

# Create GridSearchCV object
grid_search = GridSearchCV(model, parameters, cv=cv, n_jobs=-1, verbose=1)

# Train the model with GridSearchCV
grid_search.fit(X_train, y_train)

# Print best parameters
print("Best parameters found:", grid_search.best_params_)

# Evaluate the model on the test set
y_pred = grid_search.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))

print(f"Accuracy Score: {accuracy_score(y_test, y_pred):.2f}")

# Save the best model
joblib.dump(grid_search.best_estimator_, 'qa_retrieval_model.pkl')

print("Model saved as 'qa_retrieval_model_new.pkl'")


Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best parameters found: {'clf__alpha': 0.1, 'vect__ngram_range': (1, 1)}
Classification Report:
                    precision    recall  f1-score   support

          5 years.       0.00      0.00      0.00         1
          6 years.       0.00      0.00      0.00         1
       Blue whale.       0.00      0.00      0.00         0
         Canberra.       0.00      0.00      0.00         0
      Giant panda.       0.00      0.00      0.00         1
       Harper Lee.       0.00      0.00      0.00         0
     Indian rupee.       0.00      0.00      0.00         0
     J.K. Rowling.       0.00      0.00      0.00         1
   J.R.R. Tolkien.       0.00      0.00      0.00         0
         Kangaroo.       0.00      0.00      0.00         0
M. Venkaiah Naidu.       0.00      0.00      0.00         0
      Marie Curie.       0.00      0.00      0.00         1
           Moscow.       0.00      0.00      0.00         1
    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
