In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import spacy

In [2]:
df=pd.read_csv(r"D:\project 5\healthcare_reviews (1).csv")
df

Unnamed: 0,Review_Text,Rating
0,I have mixed feelings about my experience.,4
1,The staff was caring and attentive. I couldn't...,5
2,I have mixed feelings about my experience.,5
3,I have mixed feelings about my experience.,5
4,The healthcare provider was excellent. I had a...,3
...,...,...
995,My experience was terrible. I would not recomm...,5
996,The service was disappointing. I won't be comi...,4
997,"The service was okay, but nothing exceptional.",3
998,I have mixed feelings about my experience.,5


In [3]:
df['num_words']=df['Review_Text'].apply(lambda x:len(str(x).split()))

In [4]:
df['num_words']

0       7
1      10
2       7
3       7
4      10
       ..
995    10
996     9
997     7
998     7
999    10
Name: num_words, Length: 1000, dtype: int64

In [5]:
print('Maximum number of words',df['num_words'].max())
print('Sum number of words',df['num_words'].sum())
print('Minimum number of words',df['num_words'].min())

Maximum number of words 12
Sum number of words 8513
Minimum number of words 1


In [6]:
df.isnull().sum()

Review_Text    100
Rating           0
num_words        0
dtype: int64

In [7]:
df.fillna(0,inplace=True)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Review_Text  1000 non-null   object
 1   Rating       1000 non-null   int64 
 2   num_words    1000 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 23.6+ KB


In [9]:
df.head()

Unnamed: 0,Review_Text,Rating,num_words
0,I have mixed feelings about my experience.,4,7
1,The staff was caring and attentive. I couldn't...,5,10
2,I have mixed feelings about my experience.,5,7
3,I have mixed feelings about my experience.,5,7
4,The healthcare provider was excellent. I had a...,3,10


In [10]:
df[['Rating','num_words']].corr()

Unnamed: 0,Rating,num_words
Rating,1.0,-0.033387
num_words,-0.033387,1.0


 the correlation coefficient of approximately -0.033 indicates that 'Rating' and 'num_words' are inversely related, but the relationship is very weak

In [11]:
df.Rating.value_counts()

Rating
4    223
5    211
2    209
1    189
3    168
Name: count, dtype: int64

In [12]:
df['Review_Text']=df['Review_Text'].astype(str)


In [13]:
import spacy
import pandas as pd
from joblib import Parallel, delayed

def clean_text(text):
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)
    cleaned_text = [w.lemma_.lower().strip() for w in doc
                    if not (w.is_stop or w.is_punct or w.is_digit)]
    return ' '.join(cleaned_text)

def parallel_cleaning(df):
    cleaned_texts = Parallel(n_jobs=-1)(delayed(clean_text)(text) for text in df['Review_Text'])    
    df['Review_Text'] = cleaned_texts
    return df

# Assuming news is your DataFrame
cleaned_df = parallel_cleaning(df)


In [14]:
cleaned_df

Unnamed: 0,Review_Text,Rating,num_words
0,mixed feeling experience,4,7
1,staff care attentive happy,5,10
2,mixed feeling experience,5,7
3,mixed feeling experience,5,7
4,healthcare provider excellent great experience,3,10
...,...,...,...
995,experience terrible recommend provider,5,10
996,service disappointing will come,4,9
997,service okay exceptional,3,7
998,mixed feeling experience,5,7


In [15]:
cleaned_df['num_words']=cleaned_df['Review_Text'].apply(lambda x:len(str(x).split()))
cleaned_df

Unnamed: 0,Review_Text,Rating,num_words
0,mixed feeling experience,4,3
1,staff care attentive happy,5,4
2,mixed feeling experience,5,3
3,mixed feeling experience,5,3
4,healthcare provider excellent great experience,3,5
...,...,...,...
995,experience terrible recommend provider,5,4
996,service disappointing will come,4,4
997,service okay exceptional,3,3
998,mixed feeling experience,5,3


In [16]:
cleaned_df_2=cleaned_df.copy()

In [17]:
cleaned_df_2.head()

Unnamed: 0,Review_Text,Rating,num_words
0,mixed feeling experience,4,3
1,staff care attentive happy,5,4
2,mixed feeling experience,5,3
3,mixed feeling experience,5,3
4,healthcare provider excellent great experience,3,5


In [18]:
import pandas as pd
from textblob import TextBlob
import spacy

# Load the spaCy English model
nlp = spacy.load('en_core_web_sm')


# Function to calculate sentiment using TextBlob
def get_sentiment(text,threshold=0):
    blob = TextBlob(text)
    polarity= blob.sentiment.polarity
    if polarity>threshold:
        return 'Positive'
    elif polarity<-threshold:
        return 'Negative'
    else:
        return 'Neutral'

# Apply sentiment analysis to each review
cleaned_df_2['Sentiment'] = cleaned_df_2['Review_Text'].apply(get_sentiment,threshold=0.0)

# Display the result
(cleaned_df_2)


Unnamed: 0,Review_Text,Rating,num_words,Sentiment
0,mixed feeling experience,4,3,Neutral
1,staff care attentive happy,5,4,Positive
2,mixed feeling experience,5,3,Neutral
3,mixed feeling experience,5,3,Neutral
4,healthcare provider excellent great experience,3,5,Positive
...,...,...,...,...
995,experience terrible recommend provider,5,4,Negative
996,service disappointing will come,4,4,Negative
997,service okay exceptional,3,3,Positive
998,mixed feeling experience,5,3,Neutral


In [19]:
cleaned_df_2['Rating'].dtype

dtype('int64')

In [20]:
cleaned_df_2['Rating'] = cleaned_df_2['Rating'].astype(int)


In [21]:
# Create a new DataFrame with opposite sentiment and rating
opposite_df = cleaned_df_2[((cleaned_df_2['Sentiment'] == 'Positive') & (cleaned_df_2['Rating'] < 3)) | 
                 ((cleaned_df_2['Sentiment'] == 'Negative') & (cleaned_df_2['Rating'] > 3))]

# Display the new DataFrame
(opposite_df)

Unnamed: 0,Review_Text,Rating,num_words,Sentiment
10,service okay exceptional,1,3,Positive
17,experience terrible recommend provider,5,4,Negative
18,bad experience healthcare provider avoid possible,5,6,Negative
21,satisfied service receive highly recommend,1,5,Positive
22,service disappointing will come,5,4,Negative
...,...,...,...,...
985,staff care attentive happy,2,4,Positive
987,bad experience healthcare provider avoid possible,5,6,Negative
993,average experience good bad,4,4,Negative
995,experience terrible recommend provider,5,4,Negative


In [22]:
(opposite_df[opposite_df['Sentiment']=='Positive'])

Unnamed: 0,Review_Text,Rating,num_words,Sentiment
10,service okay exceptional,1,3,Positive
21,satisfied service receive highly recommend,1,5,Positive
37,service okay exceptional,2,3,Positive
47,healthcare provider excellent great experience,2,5,Positive
69,satisfied service receive highly recommend,2,5,Positive
...,...,...,...,...
975,staff care attentive happy,1,4,Positive
979,service okay exceptional,1,3,Positive
980,service okay exceptional,2,3,Positive
981,satisfied service receive highly recommend,2,5,Positive


In [23]:
(opposite_df[opposite_df['Sentiment']=='Negative'])


Unnamed: 0,Review_Text,Rating,num_words,Sentiment
17,experience terrible recommend provider,5,4,Negative
18,bad experience healthcare provider avoid possible,5,6,Negative
22,service disappointing will come,5,4,Negative
29,average experience good bad,4,4,Negative
36,average experience good bad,4,4,Negative
...,...,...,...,...
982,bad experience healthcare provider avoid possible,4,6,Negative
987,bad experience healthcare provider avoid possible,5,6,Negative
993,average experience good bad,4,4,Negative
995,experience terrible recommend provider,5,4,Negative


In [24]:
# Filter the DataFrame to exclude rows where Sentiment and Rating are opposite
filtered_df = cleaned_df_2[~(((cleaned_df_2['Sentiment'] == 'Positive') & (cleaned_df_2['Rating'] < 3)) | 
                   ((cleaned_df_2['Sentiment'] == 'Negative') & (cleaned_df_2['Rating'] > 3)))]

# Display the filtered DataFrame
(filtered_df)

Unnamed: 0,Review_Text,Rating,num_words,Sentiment
0,mixed feeling experience,4,3,Neutral
1,staff care attentive happy,5,4,Positive
2,mixed feeling experience,5,3,Neutral
3,mixed feeling experience,5,3,Neutral
4,healthcare provider excellent great experience,3,5,Positive
...,...,...,...,...
992,service disappointing will come,1,4,Negative
994,experience terrible recommend provider,1,4,Negative
997,service okay exceptional,3,3,Positive
998,mixed feeling experience,5,3,Neutral


In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix,classification_report,roc_curve,auc

In [26]:
filtered_df['Sentiment'].value_counts()

Sentiment
Positive    240
Negative    220
Neutral     198
Name: count, dtype: int64

In [28]:
tf=TfidfVectorizer(analyzer='word',ngram_range=(1,3),max_features=5000)
X=tf.fit_transform(filtered_df['Review_Text'])

In [29]:
X

<658x75 sparse matrix of type '<class 'numpy.float64'>'
	with 5268 stored elements in Compressed Sparse Row format>

In [30]:
print(X)

  (0, 43)	0.43268976309257456
  (0, 29)	0.43268976309257456
  (0, 42)	0.43268976309257456
  (0, 21)	0.2527802297935072
  (0, 28)	0.43268976309257456
  (0, 41)	0.43268976309257456
  (1, 12)	0.3333333333333333
  (1, 69)	0.3333333333333333
  (1, 1)	0.3333333333333333
  (1, 11)	0.3333333333333333
  (1, 68)	0.3333333333333333
  (1, 34)	0.3333333333333333
  (1, 0)	0.3333333333333333
  (1, 10)	0.3333333333333333
  (1, 67)	0.3333333333333333
  (2, 43)	0.43268976309257456
  (2, 29)	0.43268976309257456
  (2, 42)	0.43268976309257456
  (2, 21)	0.2527802297935072
  (2, 28)	0.43268976309257456
  (2, 41)	0.43268976309257456
  (3, 43)	0.43268976309257456
  (3, 29)	0.43268976309257456
  (3, 42)	0.43268976309257456
  (3, 21)	0.2527802297935072
  :	:
  (654, 55)	0.2900315499377171
  (654, 70)	0.36990306722553973
  (654, 47)	0.24894005461119936
  (654, 21)	0.18149621041147312
  (655, 64)	0.4284294781538596
  (655, 45)	0.4284294781538596
  (655, 63)	0.4284294781538596
  (655, 20)	0.4284294781538596
  (655,

In [31]:
y=filtered_df['Sentiment']
y

0       Neutral
1      Positive
2       Neutral
3       Neutral
4      Positive
         ...   
992    Negative
994    Negative
997    Positive
998     Neutral
999    Positive
Name: Sentiment, Length: 658, dtype: object

In [32]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=45)

In [33]:
nb=BernoulliNB()
nb.fit(X_train,y_train)

In [34]:
y_pred=nb.predict(X_test)

In [35]:
X_train.shape

(460, 75)

In [36]:
print('Confusion matrix')
print(confusion_matrix(y_pred,y_test))
print('-----------------------------')
print(classification_report(y_pred,y_test))

Confusion matrix
[[68  0  0]
 [ 0 56  0]
 [ 0  0 74]]
-----------------------------
              precision    recall  f1-score   support

    Negative       1.00      1.00      1.00        68
     Neutral       1.00      1.00      1.00        56
    Positive       1.00      1.00      1.00        74

    accuracy                           1.00       198
   macro avg       1.00      1.00      1.00       198
weighted avg       1.00      1.00      1.00       198

