In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [62]:
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [63]:
review1 = pd.read_csv("data.csv")
review1.head()

Unnamed: 0,Reviewer Name,Review Title,Place of Review,Up Votes,Down Votes,Month,Review text,Ratings
0,Kamal Suresh,Nice product,"Certified Buyer, Chirakkal",889.0,64.0,Feb 2021,"Nice product, good quality, but price is now r...",4
1,Flipkart Customer,Don't waste your money,"Certified Buyer, Hyderabad",109.0,6.0,Feb 2021,They didn't supplied Yonex Mavis 350. Outside ...,1
2,A. S. Raja Srinivasan,Did not meet expectations,"Certified Buyer, Dharmapuri",42.0,3.0,Apr 2021,Worst product. Damaged shuttlecocks packed in ...,1
3,Suresh Narayanasamy,Fair,"Certified Buyer, Chennai",25.0,1.0,,"Quite O. K. , but nowadays the quality of the...",3
4,ASHIK P A,Over priced,,147.0,24.0,Apr 2016,Over pricedJust â?¹620 ..from retailer.I didn'...,1


In [64]:
review1.drop_duplicates(inplace=True)

In [65]:
review1.dropna(inplace=True)

In [66]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+','',text)
    text = re.sub(r'[^\w\s]','',text)
    words = word_tokenize(text)
    words = [word for word in words if word not in stopwords.words('english')]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    text = ' '.join(words).strip()
    return text

In [67]:
review1['cleaned_text']=review1['Review text'].apply(clean_text)

In [68]:
cleaned_df = review1

In [69]:
def assign_ratings(ratings):
    if ratings == 5 or ratings == 4:
        return "Positive"
    else:
        return "Negative"


cleaned_df["Target"]=cleaned_df["Ratings"].apply(assign_ratings)

In [70]:
cleaned_df['Target'] = cleaned_df['Target'].replace({'Positive':1,'Negative':0}, regex=True)

  cleaned_df['Target'] = cleaned_df['Target'].replace({'Positive':1,'Negative':0}, regex=True)


In [128]:
import os
os.environ["PREFECT_API_URL"] = "ephemeral"



In [129]:
from prefect import task, flow
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [130]:
df=cleaned_df[["cleaned_text","Target"]]

In [131]:
df.head()

Unnamed: 0,cleaned_text,Target
0,nice product good quality price rising bad sig...,1
1,didnt supplied yonex mavis outside cover yonex...,0
2,worst product damaged shuttlecock packed new b...,0
5,good quality product delivered timeread,1
6,best purchase good quality durable average shu...,1


In [132]:
df.shape

(8013, 2)

In [133]:
def load_data(data):
    sentiment=pd.DataFrame(data)
    return sentiment

In [134]:
# load_data(df)

In [135]:
def splits_input_output(data, inputs, outputs):
    x=data[inputs]
    y=data[outputs]
    return x, y

In [136]:
# splits_input_output(df, "cleaned_text", "Target")

In [137]:
def split_train_test(x, y, test_size=0.2, random_state=42):
    return train_test_split(x, y, test_size=test_size, random_state=42)

In [138]:
def vectorize_text(x_train, x_test):
    tfidf = TfidfVectorizer()
    x_train_tfidf = tfidf.fit_transform(x_train)
    x_test_tfidf = tfidf.transform(x_test)
    return x_train_tfidf, x_test_tfidf, tfidf

In [139]:
def train_model(x_train, y_train, hyperparameters):
    rf=RandomForestClassifier(**hyperparameters)
    rf.fit(x_train, y_train)
    return rf

In [140]:
def evaluate_model(model, x_train, x_test, y_train, y_test):
    y_train_pred=model.predict(x_train)
    y_test_pred=model.predict(x_test)

    train_score = metrics.accuracy_score(y_train, y_train_pred)
    test_score = metrics.accuracy_score(y_test, y_test_pred)
    return train_score, test_score

In [141]:
def workflow(data):
    inputs = "cleaned_text"
    outputs = "Target"
    hyperparameters = {"min_samples_split": 5, "n_estimators": 300}

    # load data
    dataset = load_data(data)
    
    # splitting input and output
    x, y = splits_input_output(dataset, inputs, outputs)
    
    # train test split 
    x_train, x_test, y_train, y_test = split_train_test(x, y)

    # vectorize text 
    x_train_tfidf, x_test_tfidf, tfidf = vectorize_text(x_train, x_test)
    
    # train model
    model = train_model(x_train_tfidf, y_train, hyperparameters)
    
    # evaluate
    train_score, test_score = evaluate_model(model, x_train_tfidf, x_test_tfidf, y_train, y_test)

    print("Training Accuracy:", train_score)
    print("Testing Accuracy:", test_score)


In [142]:
if __name__ == "__main__":
    workflow(df)

Training Accuracy: 0.9522620904836193
Testing Accuracy: 0.8858390517779164


In [143]:
@task
def load_data(data):
    sentiment=pd.DataFrame(data)
    return sentiment

In [144]:
@task
def splits_input_output(data, inputs, outputs):
    x=data[inputs]
    y=data[outputs]
    return x, y

In [145]:
@task
def split_train_test(x, y, test_size=0.2, random_state=42):
    return train_test_split(x, y, test_size=test_size, random_state=42)

In [146]:
@task
def vectorize_text(x_train, x_test):
    tfidf = TfidfVectorizer()
    x_train_tfidf = tfidf.fit_transform(x_train)
    x_test_tfidf = tfidf.transform(x_test)
    return x_train_tfidf, x_test_tfidf, tfidf

In [147]:
@task
def train_model(x_train, y_train, hyperparameters):
    rf=RandomForestClassifier(**hyperparameters)
    rf.fit(x_train, y_train)
    return rf

In [148]:
@task
def evaluate_model(model, x_train, x_test, y_train, y_test):
    y_train_pred=model.predict(x_train)
    y_test_pred=model.predict(x_test)

    train_score = metrics.accuracy_score(y_train, y_train_pred)
    test_score = metrics.accuracy_score(y_test, y_test_pred)
    return train_score, test_score

In [149]:
import prefect
print(prefect.__version__)

import prefect.task_runners as tr
print(dir(tr))


3.4.11
['Any', 'ConcurrentTaskRunner', 'Coroutine', 'F', 'Generic', 'Iterable', 'MappingLengthMismatch', 'MappingMissingIterable', 'P', 'PREFECT_TASK_RUNNER_THREAD_POOL_MAX_WORKERS', 'ParamSpec', 'PrefectConcurrentFuture', 'PrefectDistributedFuture', 'PrefectFuture', 'PrefectFutureList', 'PrefectTaskRunner', 'R', 'RunInput', 'Self', 'T', 'TYPE_CHECKING', 'TaskRunner', 'ThreadPoolExecutor', 'ThreadPoolTaskRunner', 'TypeVar', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'abc', 'allow_failure', 'annotations', 'asyncio', 'collapse_variadic_parameters', 'copy_context', 'explode_variadic_parameter', 'get_logger', 'get_parameter_defaults', 'get_run_logger', 'isiterable', 'overload', 'quote', 'sys', 'threading', 'unmapped', 'uuid', 'uuid7']


In [150]:
from prefect.task_runners import ThreadPoolTaskRunner

@flow(name="rf_training_flow", task_runner=ThreadPoolTaskRunner())
def workflow():
    inputs = "cleaned_text"
    outputs = "Target"
    hyperparameters = {"min_samples_split": 5, "n_estimators": 300}

    # load data
    dataset = load_data(df)
    
    # splitting input and output
    x, y = splits_input_output(dataset, inputs, outputs)
    
    # train test split 
    x_train, x_test, y_train, y_test = split_train_test(x, y)

    # vectorize text 
    x_train_tfidf, x_test_tfidf, tfidf = vectorize_text(x_train, x_test)
    
    # train model
    model = train_model(x_train_tfidf, y_train, hyperparameters)
    
    # evaluate
    train_score, test_score = evaluate_model(model, x_train_tfidf, x_test_tfidf, y_train, y_test)

    print("Training Accuracy:", train_score)
    print("Testing Accuracy:", test_score)


In [151]:
if __name__ == "__main__":
    workflow()

RuntimeError: Failed to reach API at http://127.0.0.1:4200/api/