In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the dataset
csv_file_path = 'pizza.csv'  # Replace with your file path
data = pd.read_csv(csv_file_path)

# Preprocess the data (optional steps depending on your data)
# For example, handling missing values, data normalization, etc.

# Split the data into features (X) and target (y)
X = data['request_text']  # or 'request_text_edit_aware'
y = data['requester_received_pizza']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Replace missing values with an empty string
X_train = X_train.fillna('')
X_test = X_test.fillna('')

# Initialize a TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=50000)  # You can adjust the number of features

# Fit and transform the training data
X_train_tfidf = vectorizer.fit_transform(X_train)

# Transform the test data
X_test_tfidf = vectorizer.transform(X_test)


In [3]:
'''from sklearn.naive_bayes import MultinomialNB

# Initialize the model
model = MultinomialNB()'''

from sklearn.ensemble import RandomForestClassifier

# Initialize the model with class weights
model = RandomForestClassifier(class_weight='balanced', random_state=42)


In [4]:
# Train the model
model.fit(X_train_tfidf, y_train)


In [5]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on the test set
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)


Accuracy: 0.7400990099009901
Classification Report:
               precision    recall  f1-score   support

       False       0.74      1.00      0.85       597
        True       1.00      0.00      0.01       211

    accuracy                           0.74       808
   macro avg       0.87      0.50      0.43       808
weighted avg       0.81      0.74      0.63       808



In [6]:
# Predicting new data
new_request = ["I am a student and I am broke. I would love to have a pizza. I would gladly pay it forward. Thank you! I have been relying on the food bank and I am running out of food. If there is anyone out there who can help me, I would appreciate it. I love puppies and kittens."]
new_request_tfidf = vectorizer.transform(new_request)
prediction = model.predict(new_request_tfidf)

print(prediction)
print("Prediction:", "Success" if prediction[0] else "Failure")


[False]
Prediction: Failure


In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

data['request_text'] = data['request_text'].astype(str)

texts = data['request_text'].tolist()




In [8]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

# Convert texts to sequences of integers
sequences = tokenizer.texts_to_sequences(texts)

# Pad sequences to ensure uniform length
max_sequence_length = max([len(x) for x in sequences])
sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')


In [9]:
# Assuming 'sequences' is your list of token sequences
sequences = np.array(sequences)

# X will be all tokens except the last
X = sequences[:, :-1]

# y will be the last token in each sequence
y = sequences[:, -1]


In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=64, input_length=873))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(128))
model.add(Dense(len(tokenizer.word_index) + 1, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam')


In [11]:
from tensorflow.keras.utils import to_categorical

# The number of unique words in your dataset
vocabulary_size = len(tokenizer.word_index) + 1


# One-hot encode y
y = to_categorical(y, num_classes=vocabulary_size)


In [15]:
# Prepare your input and output data (X and y) for training

model.fit(X, y, epochs=1, batch_size=32)




<keras.callbacks.History at 0x22dc1875570>

In [30]:
'''def generate_text(seed_text, num_words):
    for _ in range(num_words):
        sequence = tokenizer.texts_to_sequences([seed_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_sequence_length, padding='post')
        
        predicted = model.predict_classes(sequence, verbose=0)[0]
        output_word = ""
        
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text'''

def generate_text(seed_text, num_words, model, tokenizer, max_sequence_length):
    for _ in range(num_words):
        # Convert the current seed text to a token sequence
        sequence = tokenizer.texts_to_sequences([seed_text])[0]
        # Pad the sequence
        sequence = pad_sequences([sequence], maxlen=873, padding='post')
        # Predict the next token (word)
        predictions = model.predict(sequence, verbose=0)[0]
        # Get the index of the highest probability prediction
        predicted_index = np.argmax(predictions)
        output_word = ""
        # Map the predicted index to the corresponding word
        for word, index in tokenizer.word_index.items():
            if index == predicted_index:
                output_word = word
                break
        seed_text += " " + output_word
    return seed_text




In [31]:
# Example usage
generated_text = generate_text("I would like", 10, model, tokenizer, 873)
print(generated_text)

I would like          
