<a href="https://colab.research.google.com/github/Sedacoder/NLP/blob/main/sentiment_analysis_hotel_reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U nltk
import nltk
import pandas as pd
import time
print("Loading data now.This could take a while depending on file size")
start = time.time()
df = pd.read_csv('Hotel_Reviews.csv')
end = time.time()
print("Loading took" + str(round(end - start, 2))+"seconds")

Loading data now.This could take a while depending on file size
Loading took2.84seconds


In [None]:
print(df.shape)
nationality_freq = df["Reviewer_Nationality"].value_counts()

print("There are " + str(nationality_freq.size) + " different countries")
#nationality_freq.size counts the nations and gives the total number
#nationality_freq will print the first and last rows of the series.
print(nationality_freq)
print("\n ALL THE DATA")
print(nationality_freq.to_string())
#nationality_freq.to_string prints all the data


In [None]:
#.index[] will print the country at the placed index in the column
nationality_freq.index[2]

nationality_freq[1:11]#Prints the nations from index 1 to 11.


Reviewer_Nationality
 United States of America     20098
 Australia                    12836
 Ireland                       8560
 United Arab Emirates          5488
 Saudi Arabia                  5179
 Netherlands                   4774
 France                        4559
 Canada                        4523
 Switzerland                   4503
 Germany                       4071
Name: count, dtype: int64

In [None]:
for nat in nationality_freq[:10].index:
  #Extracts all rows that match the criteria into a new dataframe
   nat_df = df[df["Reviewer_Nationality"] == nat]
   #Obtain the hotel frequency
   freq = nat_df["Hotel_Name"].value_counts()

print("The most reviewed hotel for " + str(nat).strip() + " was " + str(freq.index[0]) + " with " + str(freq[0]) + " reviews.")


The most reviewed hotel for Switzerland was Strand Palace Hotel with 55 reviews.


In [40]:
# First create a new dataframe based on the old one, removing the uneeded columns
hotel_freq_df = df.drop(["Hotel_Address", "Additional_Number_of_Scoring", "Review_Date", "Average_Score", "Reviewer_Nationality", "Negative_Review", "Review_Total_Negative_Word_Counts", "Positive_Review", "Review_Total_Positive_Word_Counts", "Total_Number_of_Reviews_Reviewer_Has_Given", "Reviewer_Score", "Tags", "days_since_review", "lat", "lng"], axis = 1)

# Group the rows by Hotel_Name, count them and put the result in a new column Total_Reviews_Found
hotel_freq_df['Total_Reviews_Found'] = hotel_freq_df.groupby('Hotel_Name').transform('count')

# Get rid of all the duplicated rows
hotel_freq_df = hotel_freq_df.drop_duplicates(subset = ["Hotel_Name"])
display(hotel_freq_df)

Unnamed: 0,Hotel_Name,Total_Number_of_Reviews,Total_Reviews_Found
0,Hotel Arena,1403,405
405,K K Hotel George,1831,566
971,Apex Temple Court Hotel,2619,1037
2008,The Park Grand London Paddington,4380,1770
3778,Monhotel Lounge SPA,171,35
...,...,...,...
308683,Swiss tel Amsterdam,2756,830
309513,City Rooms,414,44
309557,Holiday Inn Amsterdam,1485,286
309843,Jaz Amsterdam,7108,1157


In [41]:
def get_difference_review_avg(row):
  return row["Average_Score"] - row["Calc_Average_Score"]

#Creates Calc_Average_Score column
df['Calc_Average_Score'] = round(df.groupby('Hotel_Name').Reviewer_Score.transform('mean'), 1)

# Add a new column with the difference between the two average scores
df["Average_Score_Difference"] = df.apply(get_difference_review_avg, axis = 1)

# Create a df without all the duplicates of Hotel_Name (so only 1 row per hotel)
review_scores_df = df.drop_duplicates(subset = ["Hotel_Name"])


# Sort the dataframe to find the lowest and highest average score difference
review_scores_df = review_scores_df.sort_values(by=["Average_Score_Difference"])


display(review_scores_df[["Average_Score_Difference", "Average_Score", "Calc_Average_Score", "Hotel_Name"]])


Unnamed: 0,Average_Score_Difference,Average_Score,Calc_Average_Score,Hotel_Name
43688,-0.7,7.5,8.2,Mercure Paris Porte d Orleans
178253,-0.7,7.9,8.6,Renaissance Paris Vendome Hotel
111027,-0.7,8.8,9.5,Hotel Stendhal Place Vend me Paris MGallery by...
218258,-0.5,7.0,7.5,Hotel Royal Elys es
71274,-0.5,8.9,9.4,Drawing Hotel
...,...,...,...,...
201776,0.7,7.5,6.8,Mercure Paris Op ra Faubourg Montmartre
22189,0.8,7.1,6.3,Holiday Inn Paris Montparnasse Pasteur
250308,0.9,8.6,7.7,MARQUIS Faubourg St Honor Relais Ch teaux
68936,0.9,6.8,5.9,Villa Eugenie


In [43]:
#Counting using lambda
start = time.time()
no_negative_reviews = df.apply(lambda x: True if x['Negative_Review'] == "No Negative" else False , axis=1)
print("Number of No Negative reviews: " + str(len(no_negative_reviews[no_negative_reviews == True].index)))

no_positive_reviews = df.apply(lambda x: True if x['Positive_Review'] == "No Positive" else False , axis=1)
print("Number of No Positive reviews: " + str(len(no_positive_reviews[no_positive_reviews == True].index)))

both_no_reviews = df.apply(lambda x: True if x['Negative_Review'] == "No Negative" and x['Positive_Review'] == "No Positive" else False , axis=1)
print("Number of both No Negative and No Positive reviews: " + str(len(both_no_reviews[both_no_reviews == True].index)))
end = time.time()
print("Lambdas took " + str(round(end - start, 2)) + " seconds")


Number of No Negative reviews: 75924
Number of No Positive reviews: 22043
Number of both No Negative and No Positive reviews: 78
Lambdas took 6.05 seconds


In [44]:
# without lambdas (using a mixture of notations to show you can use both)
start = time.time()
no_negative_reviews = sum(df.Negative_Review == "No Negative")
print("Number of No Negative reviews: " + str(no_negative_reviews))

no_positive_reviews = sum(df["Positive_Review"] == "No Positive")
print("Number of No Positive reviews: " + str(no_positive_reviews))

both_no_reviews = sum((df.Negative_Review == "No Negative") & (df.Positive_Review == "No Positive"))
print("Number of both No Negative and No Positive reviews: " + str(both_no_reviews))

end = time.time()
print("Sum took " + str(round(end - start, 2)) + " seconds")


Number of No Negative reviews: 75924
Number of No Positive reviews: 22043
Number of both No Negative and No Positive reviews: 78
Sum took 0.37 seconds


Ayt. Now being done with this data analysis and performing operations on the data I just need to understand exactly what i have done and how I have done it.
Because this is essentially copied and pasted. Though  I have obviously read through the lesson. I am just forced to use the dataset given because I can't really get another one
