In [49]:
import pandas as pd

In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import feature_extraction 
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

In [51]:
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [52]:
df = pd.read_excel("./OBGYN_new_train_80000.xlsx")
df2 = pd.read_excel("./OBGYN_new_test_withoutAnswer_20000_2024.xlsx")

In [53]:
def preprocessText(text_lower):
    text_lower = text_lower.lower()
    translator = str.maketrans('', '', string.punctuation)
    text_lower = text_lower.translate(translator)
    words = text_lower.split()
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

In [54]:
train_df = df
train_df['review'] = train_df['review'].apply(preprocessText)
train_df.head()

Unnamed: 0,reviewID,doctorID,doctorName,specialty,numReviews,city,state,doctorHomepage,averageRating,staff,punctuality,helpfulness,knowledge,postedTime,review,review_helpful_count
0,1,2320644,Dr. Kevin G. Fahey,Gynecologist (OBGYN),2,New Haven,MI,/doctor-ratings/2320644/Dr-KEVIN%2BG.-FAHEY-Ne...,4.25,4,3,5,5,2013-06-12 16:00:00,went doctor close house great share office doc...,0
1,2,961169,Dr. Sudha R. Nair,Gynecologist (OBGYN),5,Knoxville,TN,/doctor-ratings/961169/Dr-Sudha%2BR.-Nair-Knox...,2.0,2,3,2,1,2010-03-21 16:36:00,unprofessional knowledgeable office surgical p...,0
2,3,876934,Dr. Bonnie Gong,Gynecologist (OBGYN),6,Kirkland,WA,/doctor-ratings/876934/Dr-Bonnie-Gong-Kirkland...,4.0,3,3,5,5,2010-08-12 13:08:00,doctor leave practice due illness disappointed...,0
3,4,102625,Dr. Louann Turner,Gynecologist (OBGYN),6,Suffolk,VA,/doctor-ratings/102625/Dr-Louann-Turner-Suffol...,4.5,4,4,5,5,2009-07-23 12:09:00,wonderful take time tell exactly thing term un...,0
4,5,42933,Dr. Michael A. Benson,Gynecologist (OBGYN),21,Staten Island,NY,/doctor-ratings/42933/Dr-Michael%2BA.-Benson-S...,5.0,5,5,5,5,2008-03-14 16:22:00,excellent doctor caring considerate excellent ...,0


In [55]:
test_df = df2
test_df['review'] = test_df['review'].apply(preprocessText)
test_df.head()

Unnamed: 0,reviewID,doctorID,doctorName,specialty,numReviews,city,state,doctorHomepage,postedTime,review,review_helpful_count
0,80001,106417,Dr. Daniel E. Bahnmiller,Gynecologist (OBGYN),22,Richland,WA,/doctor-ratings/106417/Dr-Daniel%2BE.-Bahnmill...,2010-09-10 22:00:00,friendly helpful staff doctor doctor listens a...,0
1,80002,31663,Dr. Donald P. Ward,Gynecologist (OBGYN),8,Austin,TX,/doctor-ratings/31663/Dr-Donald%2BP.-Ward-Aust...,2007-01-16 13:13:00,dr ward professional time kind thoughtful gent...,0
2,80003,2244002,Dr. James R. Gullett,Gynecologist (OBGYN),2,Houston,TX,/doctor-ratings/2244002/Dr-JAMES%2BR.-GULLETT-...,2013-05-26 08:55:00,dr gullett excellent doctor staff great especi...,0
3,80004,16966,Dr. Henry Y. Su,Gynecologist (OBGYN),17,HOUSTON,TX,/doctor-ratings/16966/Dr-Henry%2BY.-Su-HOUSTON...,2009-05-22 13:57:00,seems really nice partnred w dr mundy doctor d...,0
4,80005,919,Dr. Judith A. Gurdian,Gynecologist (OBGYN),38,ROCKVILLE,MD,/doctor-ratings/919/Dr-Judith%2BA.-Gurdian-ROC...,2012-04-13 16:56:00,wonderful doctor high risk patient rare bleedi...,0


In [56]:
tfidf_vectorizer = TfidfVectorizer(max_features=3000,ngram_range=(1, 2))
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['review'])
y = (train_df['knowledge'] == 5).astype(int)

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X_train_tfidf, y, test_size=0.2, random_state=42)

In [58]:
model = LogisticRegression()

In [59]:
model.fit(X_train, y_train)

In [60]:
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9210625


In [61]:
test_tfidf = tfidf_vectorizer.transform(test_df['review'])

In [62]:
prediction = model.predict(test_tfidf)
test_df['prediction'] = prediction

In [63]:
test_df.head()

Unnamed: 0,reviewID,doctorID,doctorName,specialty,numReviews,city,state,doctorHomepage,postedTime,review,review_helpful_count,prediction
0,80001,106417,Dr. Daniel E. Bahnmiller,Gynecologist (OBGYN),22,Richland,WA,/doctor-ratings/106417/Dr-Daniel%2BE.-Bahnmill...,2010-09-10 22:00:00,friendly helpful staff doctor doctor listens a...,0,1
1,80002,31663,Dr. Donald P. Ward,Gynecologist (OBGYN),8,Austin,TX,/doctor-ratings/31663/Dr-Donald%2BP.-Ward-Aust...,2007-01-16 13:13:00,dr ward professional time kind thoughtful gent...,0,1
2,80003,2244002,Dr. James R. Gullett,Gynecologist (OBGYN),2,Houston,TX,/doctor-ratings/2244002/Dr-JAMES%2BR.-GULLETT-...,2013-05-26 08:55:00,dr gullett excellent doctor staff great especi...,0,1
3,80004,16966,Dr. Henry Y. Su,Gynecologist (OBGYN),17,HOUSTON,TX,/doctor-ratings/16966/Dr-Henry%2BY.-Su-HOUSTON...,2009-05-22 13:57:00,seems really nice partnred w dr mundy doctor d...,0,1
4,80005,919,Dr. Judith A. Gurdian,Gynecologist (OBGYN),38,ROCKVILLE,MD,/doctor-ratings/919/Dr-Judith%2BA.-Gurdian-ROC...,2012-04-13 16:56:00,wonderful doctor high risk patient rare bleedi...,0,1


In [64]:
submission = test_df[['reviewID','prediction']]
submission.head()

Unnamed: 0,reviewID,prediction
0,80001,1
1,80002,1
2,80003,1
3,80004,1
4,80005,1


In [65]:
submission.to_csv('32641622_wolverine.csv',index=False)