In [None]:
import pandas as pd

# Data Preprocessing

In [None]:
# Data preprocessing: load the raw customer-review export into a DataFrame.
# NOTE(review): the path below is absolute and machine-specific, so this cell
# only runs on the original author's machine — consider a configurable data directory.
ds = r'C:\Users\personal\OneDrive - University of Bolton\Desktop\new data.csv'
# latin-1 decoding is requested explicitly rather than relying on the pandas default.
df = pd.read_csv(ds, encoding='latin-1')
print(df.head())

In [None]:
#Removing missing values
# Report per-column missing-value counts before dropping. The original bare
# expression was not the cell's last statement, so its result was never shown.
print(df.isna().sum())
# Reset the index after dropping rows so that later column assignments from
# `df` onto freshly built frames (which carry a 0..n-1 RangeIndex) line up
# row-for-row instead of being mis-aligned on the gaps left by dropna().
df = df.dropna().reset_index(drop=True)

In [None]:
#checking for duplicates
from collections import Counter
# Counter(df) iterates a DataFrame's column labels, so the previous check could
# only ever detect duplicated column names. DataFrame.duplicated() flags rows
# that repeat an earlier row across all columns, which is what the cell intends.
duplicate_row_count = int(df.duplicated().sum())
has_duplicates = duplicate_row_count > 0
print(has_duplicates)

In [None]:
# Data cleaning
import re
import string
def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text. Removed spans are replaced by the empty string, so
        runs of whitespace around them are left untouched.
    '''
    text = text.lower()
    # Raw strings: '\[', '\w' and '\d' are invalid escapes in a normal string
    # literal and trigger a warning on current Python versions.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text
# Alias kept for the existing `.map(round1)` call sites.
round1 = clean_text_round1

In [None]:
# Cast every column to str so the cleaning functions can call string methods on each value.
df = df.astype(str)

In [None]:
# First cleaning pass over every review: lowercase, then strip bracketed text, punctuation and digit-bearing words.
df['content'] = df['content'].map(round1)

In [None]:
def clean_text_round2(text):
    '''Second cleaning pass: drop curly quotes, ellipsis characters and newlines.

    Every matched character is removed outright (not replaced by a space).
    '''
    # A single character class covers both the typographic marks and '\n';
    # deleting individual characters in one pass or two gives the same string.
    return re.sub(r'[‘’“”…\n]', '', text)


def round2(x):
    '''Thin wrapper so `.map(round2)` applies the second cleaning pass.'''
    return clean_text_round2(x)

In [None]:
# Second cleaning pass over every review: remove curly quotes, ellipses and newlines.
df['content'] = df['content'].map(round2)

In [None]:
# Removing stop words, tokenization, pos-tagging and lemmatization 
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK resources used below: the stop-word list, the Punkt tokenizer
# models, WordNet (for the lemmatizer) and the averaged-perceptron POS tagger.
# NOTE(review): newer NLTK releases may look for differently named resources
# (e.g. 'punkt_tab') — confirm against the installed version.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# A set gives constant-time membership tests while filtering tokens.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# First letter of a Penn Treebank tag -> WordNet POS code accepted by
# WordNetLemmatizer.lemmatize. Adjective tags start with 'J' (JJ, JJR, JJS),
# which WordNet calls 'a'; the previous `tag[0].lower()` test could never
# produce 'a', so adjectives were silently left un-lemmatized.
_PENN_TO_WORDNET_POS = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}

def preprocess_text(text):
    """Tokenize, drop English stop words, POS-tag and lemmatize one review.

    Parameters
    ----------
    text : str
        Cleaned review text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces. Tokens tagged as
        adjective, verb, noun or adverb are lemmatized with that part of
        speech; all other tokens pass through unchanged.
    """
    tokens = [word for word in word_tokenize(text) if word.lower() not in stop_words]
    lemmatized_tokens = []
    for token, tag in pos_tag(tokens):
        # tag[:1] (not tag[0]) so an empty tag cannot raise IndexError.
        wordnet_pos = _PENN_TO_WORDNET_POS.get(tag[:1])
        if wordnet_pos:
            lemmatized_tokens.append(lemmatizer.lemmatize(token, pos=wordnet_pos))
        else:
            lemmatized_tokens.append(token)
    return ' '.join(lemmatized_tokens)
df['content'] = df['content'].apply(preprocess_text)

In [None]:
#removing unwanted words
# Domain-specific filler words (brand names, conversational verbs, contraction
# residue) to strip from the reviews before frequency analysis.
# Fixed: a missing comma used to fuse "won t" and "would" into the single
# string "won twould", so "would" was never removed.
# NOTE(review): entries containing a space ("don t", "sainsbury s", "won t")
# can never equal a single whitespace-split token; they are kept for reference only.
words_to_remove = ["thing","get","good","could", "don t", "sainsbury s", "tesco", "s", "ve", "may", "m", "won t", "would","week","iceland","open",
                   "people","give","sainsbury","sainsbury s","name","today","shopping","won t", "wasnt","don t","though","especially", "£","superfood",
                   "well", "always", "waitrose", "definitely", "shop","buy","day", "take", "item", "like","morrison","coop","co","op","jan",
                   "dec","cd","asda","aswell","poundland", "wont", "ha", "wa", "really","find", "say","said", "didnt", "dont", "told", "went",
                   "put", "asked","one","ive","u", "tried","tell","come","using","around","keep","even","someone","seem","trying","still","that",
                   "morrison","aldi", "tesco","sainsbury","lidl","asda","abel","cole" , "either","right","arrived","lady","large","look","look","looking",
                   "due","least","nothing","till","every","use","ask","bit","go","going","working","please","id","came","gone","two","actually","getting",
                   "im","end","given","away","yet","another","left","happened","couldnt","given","think","cant", "gave","reply","able", "instead","etc",
                   "later","help","man","see","quite","go","absolutely","arrived","done","everything","contact","last","first","thought","despite","back",
                   "sometime","never","ok","see","make","almost","sainsbury","le","let","star","seems","got","given","started","reason","awful","company",
                   "need","without","know","found","Lidls", "sainsburys", "supermarket", "store"]

# Lower-cased set: constant-time lookups, and capitalised entries such as
# "Lidls" now actually match the lower-cased tokens they are compared with.
_words_to_remove_lookup = frozenset(word.lower() for word in words_to_remove)

def remove_words(x):
    """Return `x` with every whitespace-separated token whose lower-cased form
    is in `words_to_remove` dropped; remaining tokens are joined by single spaces."""
    return ' '.join(word for word in x.split() if word.lower() not in _words_to_remove_lookup)
# Strip the custom filler words from every review, token by token.
df['content'] = df['content'].apply(remove_words)

# Word frequency analysis

In [None]:
# Wordcloud for all customer reviews
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Concatenate every cleaned review into one string and build a single cloud from it.
all_text = ' '.join(df['content'])
wordcloud = WordCloud(width=800, height=400, background_color='brown').generate(all_text)
plt.figure(figsize=(10, 5))
# The cloud is rendered as an image; the axes carry no meaning, so they are hidden below.
plt.imshow(wordcloud, interpolation='bilinear')
plt.title('Word Cloud for Customer Reviews')
plt.axis('off')
plt.show()

In [None]:
# Bar chart for all customer reviews
# Number of most-frequent words to plot. The title is derived from it so the
# two cannot drift apart (the chart used to say "Top 10" while drawing 15 bars).
TOP_N_WORDS = 15

all_text = ' '.join(df['content'])
word_list = all_text.split()
# value_counts() sorts by descending frequency, so head() yields the top words.
word_freq = pd.Series(word_list).value_counts()
top_word_freq = word_freq.head(TOP_N_WORDS)
plt.figure(figsize=(12, 8))
plt.bar(top_word_freq.index, top_word_freq.values)
plt.title(f'Top {TOP_N_WORDS} Word Frequencies Across All Customer Reviews')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
plt.show()

In [None]:
#Bar charts for top words in all selected supermarkets
selected_supermarkets = ['Aldi', 'Lidl', 'Asda', 'Tesco', 'Morrison','Sainsbury', "AbelnCole"]
def generate_bar_chart(supermarket):
    """Plot the ten most frequent words in one supermarket's reviews.

    Parameters
    ----------
    supermarket : str
        Value to match exactly against the 'Supermarkets' column of `df`.

    Prints a notice and draws nothing when no review text matches.
    """
    supermarket_data = df[df['Supermarkets'] == supermarket]
    text_data = ' '.join(supermarket_data['content'])
    word_list = text_data.split()
    if not word_list:
        # Avoid emitting an empty, misleading chart for an unmatched name.
        print(f"No review text available for {supermarket}")
        return
    word_freq = pd.Series(word_list).value_counts()
    plt.figure(figsize=(12, 8))
    plt.bar(word_freq.index[:10], word_freq.values[:10])
    # Use the name as given: str.capitalize() lower-cases everything after the
    # first letter and turned 'AbelnCole' into 'Abelncole'.
    plt.title(f'Top 10 Word Frequencies for {supermarket}')
    plt.xlabel('Words')
    plt.ylabel('Frequency')
    plt.xticks(rotation=45)
    # These two calls were dedented out of the function, so they ran once at
    # definition time (before any figure existed) instead of once per chart.
    plt.tight_layout()
    plt.show()
for supermarket in selected_supermarkets:
    generate_bar_chart(supermarket)

In [None]:
## Compound Bar charts across the supermarkets
selected_supermarkets = ['Aldi', 'Lidl', 'Asda', 'Tesco', 'Morrison','Sainsbury']
# One row per supermarket: all of its reviews joined into one string, then tokenised.
superstore_texts = df[df['Supermarkets'].isin(selected_supermarkets)].groupby('Supermarkets')['content'].apply(lambda x: ' '.join(x)).to_frame()
superstore_texts['word_list'] = superstore_texts['content'].apply(lambda x: x.split())
# Full word-frequency Series per supermarket (sorted by descending count).
word_freq_dict = {}
for superstore, word_list in superstore_texts['word_list'].items():
    word_freq_dict[superstore] = pd.Series(word_list).value_counts()

# Union of every supermarket's ten most frequent words, in first-seen order.
top_words = []
for word_freq in word_freq_dict.values():
    for word in word_freq.index[:10]:
        if word not in top_words:
            top_words.append(word)

# Words x supermarkets table holding each store's real count for every
# plotted word (0 when the store never uses it).
freq_table = pd.DataFrame(
    {superstore: word_freq.reindex(top_words) for superstore, word_freq in word_freq_dict.items()},
    index=top_words,
).fillna(0)

if freq_table.empty:
    print("No review text available for the selected supermarkets")
else:
    # Grouped bars: calling plt.bar once per store drew every store at the
    # same categorical x positions, so bars for shared words overpainted each
    # other. DataFrame.plot places one bar per store side by side per word.
    ax = freq_table.plot(kind='bar', figsize=(12, 8), rot=45)
    ax.set_title('Top 10 Word Frequencies for Each Superstore')
    ax.set_xlabel('Words')
    ax.set_ylabel('Frequency')
    ax.legend()
    plt.show()

In [None]:
#bar chart for relevant adjectives across the supermarkets 
words_to_count = ["quick", "fast", 'quality', 'rude', 'helpful', 'friendly', "polite", 'cheap', "expensive", "price", "discount", "voucher"]
selected_supermarkets = ['Aldi', 'Sainsbury', 'Morrison', 'Lidl', 'Asda', 'AbelnCole', 'Tesco']
df['content_lower'] = df['content'].str.lower()
def count_words(text, word):
    """Return how many non-overlapping times `word` occurs in `text` as a substring.

    NOTE(review): this is substring matching, so 'fast' is also counted inside
    'breakfast' — confirm that is intended.
    """
    return text.count(word)
word_counts = {}
# .copy() so the per-word count columns are added to an independent frame
# rather than to a slice of `df` (which raises SettingWithCopyWarning).
df_selected = df[df['Supermarkets'].isin(selected_supermarkets)].copy()
for word in words_to_count:
    df_selected[word] = df_selected['content_lower'].apply(count_words, word=word)
    word_counts[word] = df_selected.groupby('Supermarkets')[word].sum()
# Plot one bar chart for each word (this line was stray text without a '#',
# which made the whole cell a syntax error).
plt.figure(figsize=(15, 10))

for i, word in enumerate(words_to_count, start=1):
    # 4x4 grid: 16 slots for the 12 words above.
    plt.subplot(4, 4, i)
    plt.bar(word_counts[word].index, word_counts[word].values)
    plt.title( f'Word: {word}', fontsize=16)
    plt.xlabel('Supermarket', fontsize=15)
    plt.ylabel('Word Count', fontsize=14)
    plt.xticks(rotation=45)
    for tick in plt.gca().get_xticklabels():
        if tick.get_text() in selected_supermarkets:
            tick.set_fontsize(13)
plt.tight_layout()
plt.show()
word_counts_df = pd.DataFrame(word_counts)
print(word_counts_df)


In [None]:
# Pie chart of predefined topics (share of reviews mentioning each topic's keywords)
content = df['content']
# Topic name -> keywords that signal the topic.
groups = {
    'Delivery service': ['deliver', 'driver', 'fast', 'wait', 'time', 'collect', 'cancel'],
    'Customer service': ['customer', 'staff', 'manager', 'easy', 'rude', 'helpful', 'friendly', 'polite'],
    'Online experience': ['email', 'order', 'call', 'phone', 'online'],
    'Product quality': ['food', 'product', 'organic', 'quality', 'brand', 'stock', 'rotten', 'veg', 'fruit', 'fresh', 'produce', 'vegetable'],
    'Payment process': ['queue', 'checkout', 'self', 'receipt', 'card', 'pay', 'refund'],
    'Price': ['price', 'offer', 'discount', 'voucher', 'money', 'value', 'cheap', 'expensive']
    
}
# The keywords are joined into one regex alternation, so each review counts at
# most once per topic and keywords match as substrings (e.g. 'pay' inside 'payment').
# Summing the boolean Series counts the matching reviews.
group_frequencies = {group: sum(content.str.contains('|'.join(words), case=False)) for group, words in groups.items()}
plt.figure(figsize=(8, 8))
plt.pie(group_frequencies.values(), labels=group_frequencies.keys(), autopct='%1.1f%%', startangle=140)
plt.title('Frequency of Pre-defined topics in Customer Review')
plt.axis('equal')  
plt.show()

In [None]:
# Correlation matrix of selected supermarkets
import seaborn as sns
selected_supermarkets = ['Aldi', 'Sainsbury', 'Morrison', 'Lidl', 'Asda', 'AbelnCole', 'Tesco']
df['content_lower'] = df['content'].str.lower()
words_to_count = ['delivery',"quick", "fast",  'order', 'quality', 'service', 'customer', 'rude', 'helpful', 'friendly', "polite",'price', 'cheap',"expensive", "voucher", "discount"]
# Rows are words, columns are supermarkets; cells are filled in by the loop below.
word_counts_df = pd.DataFrame(index=words_to_count, columns=selected_supermarkets)
def count_words(text, word):
    """Return how many non-overlapping times `word` occurs in `text` as a substring."""
    return text.count(word)
# Count occurrences for every supermarket and word (this line was stray text
# without a '#', which made the whole cell a syntax error).
for supermarket in selected_supermarkets:
    df_supermarket = df[df['Supermarkets'] == supermarket]
    for word in words_to_count:
        count = df_supermarket['content_lower'].apply(count_words, word=word).sum()
        word_counts_df.at[word, supermarket] = count
# The frame was created without a dtype, so its cells are objects; convert to
# numbers before correlating.
word_counts_df = word_counts_df.apply(pd.to_numeric)
# Column-wise correlation: how similar two supermarkets' keyword profiles are.
correlation_matrix = word_counts_df.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix of Word Counts for Selected Supermarkets')
plt.show()

In [None]:
# Plain-text dump of the supermarket-by-supermarket correlation table.
print(correlation_matrix)

# Topic modelling

In [None]:
#Generating topics with topic modelling
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Vectorise the reviews (vocabulary capped at 5000 terms) and fit a 10-topic LDA model.
# NOTE(review): LDA is usually fitted on raw term counts; fitting it on TF-IDF
# weights is worth confirming as a deliberate choice.
tfidf_vectorizer = TfidfVectorizer(max_features=5000) 
tfidf_matrix = tfidf_vectorizer.fit_transform(df['content'])
# random_state fixes the seed so the extracted topics are reproducible.
lda = LatentDirichletAllocation(n_components=10, random_state=42) 
lda.fit(tfidf_matrix)
def display_topics(model, feature_names, no_top_words):
    """Summarise a fitted topic model as a one-row table of top words.

    Parameters
    ----------
    model : object exposing `components_` (one weight vector per topic)
    feature_names : sequence mapping a feature position to its term
    no_top_words : int
        Number of highest-weighted terms to keep per topic.

    Returns
    -------
    pandas.DataFrame
        A single row labelled 'Top Words' with one 'Topic N' column per topic
        (numbered from 1); each cell is the space-joined terms, heaviest first.
    """
    def _top_terms(weights):
        # argsort is ascending; the reversed tail slice yields the largest weights first.
        ranked_positions = weights.argsort()[:-no_top_words - 1:-1]
        return ' '.join(feature_names[position] for position in ranked_positions)

    summary = {
        f"Topic {number}": _top_terms(weights)
        for number, weights in enumerate(model.components_, start=1)
    }
    return pd.DataFrame(summary, index=['Top Words'])
# Number of highest-weighted terms to list for each topic.
no_top_words = 10 
# Lift pandas' column-width limit so the joined top-word strings are not truncated when printed.
pd.set_option('display.max_colwidth', None)
topics_df = display_topics(lda, tfidf_vectorizer.get_feature_names_out(), no_top_words)
print("Topics generated by LDA:")
print(topics_df)

# Sentiment analysis

In [None]:
# Sentiment analysis 
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# SentimentIntensityAnalyzer loads the VADER lexicon at construction time; the
# notebook never downloaded it, so a fresh kernel failed here with LookupError.
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()
# polarity_scores returns a dict of scores per review; keep the whole dict and
# pull out the 'compound' value as the single summary score.
# NOTE(review): scoring runs on the cleaned, lemmatized text — confirm that is
# intended rather than scoring the raw reviews.
df['sentiment_scores'] = df['content'].apply(sid.polarity_scores)
df['sentiment_score'] = df['sentiment_scores'].apply(lambda scores: scores['compound'])
def categorize_sentiment(score):
    """Map a compound sentiment score to a label.

    Returns 'positive' for scores of 0.05 or more, 'negative' for scores of
    -0.05 or less, and 'neutral' for everything else.
    """
    if score >= 0.05:
        return 'positive'
    return 'negative' if score <= -0.05 else 'neutral'
# Label every review as positive / negative / neutral from its compound score.
df['sentiment_category'] = df['sentiment_score'].apply(categorize_sentiment)

In [None]:
# Bar chart of sentiment category
sentiment_counts = df['sentiment_category'].value_counts()
# value_counts() orders categories by frequency, so a fixed colour list painted
# whichever category happened to be most common green. Look the colour up by
# category name instead.
sentiment_colors = {'positive': 'green', 'negative': 'red', 'neutral': 'blue'}
bar_colors = [sentiment_colors.get(category, 'grey') for category in sentiment_counts.index]
plt.figure(figsize=(8, 6))
sentiment_counts.plot(kind='bar', color=bar_colors)
plt.title('Frequency of Sentiment Categories')
plt.xlabel('Sentiment Category')
plt.ylabel('Frequency')
plt.xticks(rotation=0)  # keep the category labels horizontal
plt.show()


In [None]:
# Bar chart of top words based on the sentiment analysis
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words='english', max_features=1000)
X = vectorizer.fit_transform(df['content'])
# One row per review, one column per vocabulary term (dense copy of the count matrix).
words_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
# Assign by position: words_df has a fresh 0..n-1 index, while df may keep gaps
# from dropna(). Index-aligned assignment would attach labels to the wrong
# reviews or insert NaN.
words_df['sentiment_category'] = df['sentiment_category'].to_numpy()
words_df = words_df[words_df['sentiment_category'] != 'neutral']
# Total count of every term within each remaining sentiment category.
sentiment_word_counts = words_df.groupby('sentiment_category').sum()
top_positive_words = sentiment_word_counts.loc['positive'].nlargest(15)
top_negative_words = sentiment_word_counts.loc['negative'].nlargest(15)
fig, axs = plt.subplots(1, 2, figsize=(12, 6))

# Positive sentiment (reversed so the most frequent word is at the top of the barh)
axs[0].barh(top_positive_words.index[::-1], top_positive_words.values[::-1], color='green')
axs[0].set_title('Top 15 Common Words for Positive Sentiment')
axs[0].set_xlabel('Frequency')
axs[0].set_ylabel('Words')

# Negative sentiment
axs[1].barh(top_negative_words.index[::-1], top_negative_words.values[::-1], color='red')
axs[1].set_title('Top 15 Common Words for Negative Sentiment')
axs[1].set_xlabel('Frequency')
axs[1].set_ylabel('Words')

plt.tight_layout()
plt.show()

In [None]:
# Bar chart of selected sets of words based on the sentiment category across the selected supermarkets
from sklearn.feature_extraction.text import CountVectorizer
set1 = ['queue', 'checkout', 'self', 'pay', 'price', 'discount', 'voucher', 'refund', 'money', 'value']
set2 = ['delivery','driver', 'fast', 'quick','wait','bag', 'box']
# Fixed: a missing comma fused 'email' and 'online' into the single term 'emailonline'.
set3 = ['email', 'online','call', 'phone','order', 'service','staff','manager']
set4 = ['quality', 'organic', 'product', 'food']
selected_supermarkets =  ["Aldi","Asda","Sainsbury","Morrison", "Tesco", "Lidl", "AbelnCole"]
# Fixed: this filtered an undefined name `Morrison` (NameError); the reviews live in `df`.
df_filtered = df[df['Supermarkets'].isin(selected_supermarkets)]
def plot_bar_chart(words, title):
    """Draw, per supermarket, a diverging bar chart of keyword counts by sentiment.

    Parameters
    ----------
    words : list of str
        Keywords to count; also used as the y-axis categories.
    title : str
        Title prefix; the supermarket name is appended to it.

    For every name in `selected_supermarkets`, positive-review counts are drawn
    to the right and negative-review counts (negated) to the left. A supermarket
    lacking either a positive or a negative group for these keywords is reported
    with a printed message and skipped.
    """
    # Keep only reviews that mention at least one of the keywords.
    df_words_filtered = df_filtered[df_filtered['content'].str.contains('|'.join(words), case=False)]
    vectorizer = CountVectorizer(vocabulary=words, lowercase=False)
    X = vectorizer.fit_transform(df_words_filtered['content'])
    words_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
    # Assign by position: words_df has a fresh 0..n-1 index while the filtered
    # frame keeps df's labels, so index-aligned assignment would mislabel rows.
    words_df['sentiment_category'] = df_words_filtered['sentiment_category'].to_numpy()
    words_df['Supermarkets'] = df_words_filtered['Supermarkets'].to_numpy()
    sentiment_word_counts = words_df.groupby(['Supermarkets', 'sentiment_category']).sum()
    for supermarket in selected_supermarkets:
        # Look both groups up before creating the figure so a missing group
        # no longer leaves an empty or half-drawn figure behind.
        try:
            positive_counts = sentiment_word_counts.loc[(supermarket, 'positive')]
            negative_counts = sentiment_word_counts.loc[(supermarket, 'negative')]
        except KeyError:
            print(f"No data available for {supermarket}")
            continue

        plt.figure(figsize=(10, 6))

        # Positive sentiment bars
        plt.barh(words, positive_counts, color='green', label='Positive')

        # Negative sentiment bars (negated so they extend to the left)
        plt.barh(words, -negative_counts, color='darkred', label='Negative')

        plt.title(f'{title} - {supermarket}')
        plt.xlabel('Frequency')
        plt.ylabel('Words')
        plt.legend()
        plt.grid(axis='x', linestyle='--', alpha=0.7)

        plt.show()

# Plot bar charts for each set of words and each supermarket
plot_bar_chart(set1, 'Distribution of Positive and Negative Sentiment')
plot_bar_chart(set2, 'Distribution of Positive and Negative Sentiment')
plot_bar_chart(set3, 'Distribution of Positive and Negative Sentiment')
plot_bar_chart(set4, 'Distribution of Positive and Negative Sentiment')


In [None]:
# Pie chart of predefined topics based on assigned related words
# NOTE(review): this cell repeats the earlier predefined-topics pie chart and
# should give the same figure unless `df['content']` changed in between.
content = df['content']

# Define the groups of words (topic name -> keywords that signal the topic)
groups = {
    'Delivery service': ['deliver', 'driver', 'fast', 'wait', 'time', 'collect', 'cancel'],
    'Customer service': ['customer', 'staff', 'manager', 'easy', 'rude', 'helpful', 'friendly', 'polite'],
    'Online experience': ['email', 'order', 'call', 'phone', 'online'],
    'Product quality': ['food', 'product', 'organic', 'quality', 'brand', 'stock', 'rotten', 'veg', 'fruit', 'fresh', 'produce', 'vegetable'],
    'Payment process': ['queue', 'checkout', 'self', 'receipt', 'card', 'pay', 'refund'],
    'Price': ['price', 'offer', 'discount', 'voucher', 'money', 'value', 'cheap', 'expensive']
    
}

# The keywords are joined into one regex alternation, so each review counts at
# most once per topic and keywords match as substrings; summing the boolean
# Series counts the matching reviews.
group_frequencies = {group: sum(content.str.contains('|'.join(words), case=False)) for group, words in groups.items()}

# Plotting the pie chart
plt.figure(figsize=(8, 8))
plt.pie(group_frequencies.values(), labels=group_frequencies.keys(), autopct='%1.1f%%', startangle=140)
plt.title('Frequency of Pre-defined topics in Customer Review')
plt.axis('equal')  
plt.show()


In [None]:
#Word cloud of sentiment category
from wordcloud import WordCloud
# One concatenated string of review text per sentiment category.
positive_text = ' '.join(df[df['sentiment_category'] == 'positive']['content'])
neutral_text = ' '.join(df[df['sentiment_category'] == 'neutral']['content'])
negative_text = ' '.join(df[df['sentiment_category'] == 'negative']['content'])

def generate_word_cloud(text, title):
    """Render a word cloud for `text` under the given figure title.

    Parameters
    ----------
    text : str
        Space-joined review text.
    title : str
        Figure title.

    Prints a notice and returns without plotting when `text` contains no
    words, instead of letting the word-cloud generator raise on empty input.
    """
    if not text.strip():
        print(f"{title}: no text available, skipping word cloud")
        return
    wordcloud = WordCloud(width=800, height=400, background_color ='white').generate(text)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(title)
    plt.axis('off')
    plt.show()

# One word cloud per sentiment category, built from the concatenated texts.
generate_word_cloud(positive_text, "Positive Sentiment Word Cloud")
generate_word_cloud(neutral_text, "Neutral Sentiment Word Cloud")
generate_word_cloud(negative_text, "Negative Sentiment Word Cloud")

In [None]:
selected_supermarkets = ['Aldi', 'Lidl', 'Asda', 'Tesco', 'Morrison','Sainsbury', "AbelnCole", "Co-op", "Waitrose"]
# NOTE(review): this redefines the `generate_bar_chart` from the earlier
# word-frequency cell; only the supermarket list differs.
def generate_bar_chart(supermarket):
    """Plot the ten most frequent words in one supermarket's reviews.

    Parameters
    ----------
    supermarket : str
        Value to match exactly against the 'Supermarkets' column of `df`.

    Prints a notice and draws nothing when no review text matches.
    """
    supermarket_data = df[df['Supermarkets'] == supermarket]
    text_data = ' '.join(supermarket_data['content'])
    word_list = text_data.split()
    if not word_list:
        # Avoid emitting an empty, misleading chart for an unmatched name.
        print(f"No review text available for {supermarket}")
        return
    word_freq = pd.Series(word_list).value_counts()
    plt.figure(figsize=(12, 8))
    plt.bar(word_freq.index[:10], word_freq.values[:10])
    # Use the name as given: str.capitalize() lower-cases everything after the
    # first letter and turned 'AbelnCole' into 'Abelncole'.
    plt.title(f'Top 10 Word Frequencies for {supermarket}')
    plt.xlabel('Words')
    plt.ylabel('Frequency')
    plt.xticks(rotation=45)
    # These two calls were dedented out of the function, so they ran once at
    # definition time (before any figure existed) instead of once per chart.
    plt.tight_layout()
    plt.show()

for supermarket in selected_supermarkets:
    generate_bar_chart(supermarket)

In [None]:
selected_supermarkets = ['Aldi', 'Lidl', 'Asda', 'Tesco', 'Morrison', 'Sainsbury', "AbelnCole", "Co-op", "Waitrose"]
def generate_word_frequency_table(supermarket):
    """Print the ten most frequent words in one supermarket's reviews.

    The output is a heading line followed by a two-column table
    ('Word', 'Frequency') ordered by descending frequency.
    """
    is_target = df['Supermarkets'] == supermarket
    tokens = ' '.join(df.loc[is_target, 'content']).split()
    # value_counts() sorts by descending count, so the first ten rows are the top ten.
    top_terms = pd.Series(tokens).value_counts().iloc[:10]
    table = pd.DataFrame({'Word': top_terms.index, 'Frequency': top_terms.to_numpy()})
    print(f"Word Frequencies for {supermarket.capitalize()}:")
    print(table)

for supermarket in selected_supermarkets:
    generate_word_frequency_table(supermarket)

# Machine learning

In [None]:
# Training and testing of the models
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

from sklearn.pipeline import make_pipeline

X = df['content']
y = df['sentiment_category']
# Kept so the names still exist for any other cell that reads them. The scores
# below no longer use this matrix, because it is fitted on every row.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_tfidf = tfidf_vectorizer.fit_transform(X)

def _cv_scores(classifier):
    """Return 5-fold accuracy scores for `classifier` on the raw review text.

    The TF-IDF step sits inside the pipeline, so its vocabulary and IDF
    weights are refitted on each training fold. Fitting the vectorizer on the
    whole corpus first (as before) leaks held-out documents into the features.
    """
    pipeline = make_pipeline(TfidfVectorizer(max_features=1000), classifier)
    return cross_val_score(pipeline, X, y, cv=5)

svm_classifier = SVC(kernel='linear')
svm_scores = _cv_scores(svm_classifier)
print("Support Vector Machine (SVM) Mean Accuracy:", svm_scores.mean())

# Higher iteration cap than the default so the solver can converge.
logistic_regression = LogisticRegression(max_iter=1000)
lr_scores = _cv_scores(logistic_regression)
print("Logistic Regression Mean Accuracy:", lr_scores.mean())

# Seeded so the reported accuracy is reproducible across runs.
random_forest = RandomForestClassifier(random_state=42)
rf_scores = _cv_scores(random_forest)
print("Random Forest Mean Accuracy:", rf_scores.mean())

naive_bayes = MultinomialNB()
nb_scores = _cv_scores(naive_bayes)
print("Naive Bayes Mean Accuracy:", nb_scores.mean())


In [None]:
# Confusion matrix of the models
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
def plot_confusion_matrix(y_true, y_pred, classifier_name):
    """Plot a labelled confusion-matrix heatmap for one classifier.

    Parameters
    ----------
    y_true, y_pred : array-like of str
        True and predicted sentiment labels ('negative', 'neutral', 'positive').
    classifier_name : str
        Appended to the figure title.
    """
    # Pin the label order explicitly. Without `labels=` the matrix covers only
    # the classes present in the data, so a missing class would shrink the
    # matrix and shift it against the fixed tick labels.
    class_order = ['negative', 'neutral', 'positive']
    tick_labels = [label.capitalize() for label in class_order]
    cm = confusion_matrix(y_true, y_pred, labels=class_order)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=tick_labels, yticklabels=tick_labels)
    plt.title(f'Confusion Matrix - {classifier_name}')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.show()
# `y_test` and the `y_pred_*` arrays were never defined anywhere in the
# notebook, so these calls raised NameError on a fresh kernel. Build them here
# from a hold-out split, fitting TF-IDF on the training rows only.
cm_text_train, cm_text_test, cm_y_train, y_test = train_test_split(
    df['content'], df['sentiment_category'], test_size=0.2, random_state=42
)
cm_vectorizer = TfidfVectorizer(max_features=1000)
cm_X_train = cm_vectorizer.fit_transform(cm_text_train)
cm_X_test = cm_vectorizer.transform(cm_text_test)

y_pred_svm = SVC(kernel='linear').fit(cm_X_train, cm_y_train).predict(cm_X_test)
y_pred_lr = LogisticRegression(max_iter=1000).fit(cm_X_train, cm_y_train).predict(cm_X_test)
y_pred_rf = RandomForestClassifier(random_state=42).fit(cm_X_train, cm_y_train).predict(cm_X_test)
y_pred_nb = MultinomialNB().fit(cm_X_train, cm_y_train).predict(cm_X_test)

plot_confusion_matrix(y_test, y_pred_svm, 'SVM')
plot_confusion_matrix(y_test, y_pred_lr, 'Logistic Regression')
plot_confusion_matrix(y_test, y_pred_rf, 'Random Forest')
plot_confusion_matrix(y_test, y_pred_nb, 'Naive Bayes')

In [None]:
# Training and testing of hybrid model
import numpy as np
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict

X = df['content']
y = df['sentiment_category']

# Split the raw text first, then fit TF-IDF on the training rows only, so the
# test documents do not influence the vocabulary or IDF weights.
X_train_text, X_test_text, y_train_labels, y_test_labels = train_test_split(X, y, test_size=0.2, random_state=42)
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train = tfidf_vectorizer.fit_transform(X_train_text)
X_test = tfidf_vectorizer.transform(X_test_text)
# Kept so the name still exists for any other cell that reads it.
X_tfidf = tfidf_vectorizer.transform(X)

svm_model = SVC(kernel='linear', probability=True, random_state=42)
lr_model = LogisticRegression(max_iter=1000)

# Out-of-fold probabilities for the training rows: each row is scored by a
# base model that never saw it, so the meta-model is trained on the same kind
# of predictions it will receive at test time.
svm_preds_train = cross_val_predict(svm_model, X_train, y_train_labels, cv=5, method='predict_proba')
lr_preds_train = cross_val_predict(lr_model, X_train, y_train_labels, cv=5, method='predict_proba')
X_train_combined = np.hstack((X_train.toarray(), svm_preds_train, lr_preds_train))

# Refit the base models on the full training split for test-time predictions.
svm_model.fit(X_train, y_train_labels)
lr_model.fit(X_train, y_train_labels)

svm_preds_test = svm_model.predict_proba(X_test)
lr_preds_test = lr_model.predict_proba(X_test)

X_test_combined = np.hstack((X_test.toarray(), svm_preds_test, lr_preds_test))

# The meta-model is now fitted on TRAINING features. It used to be fitted and
# scored on the same test rows, which reports training accuracy as if it were
# a held-out result.
hybrid_model = LogisticRegression(max_iter=1000)
hybrid_model.fit(X_train_combined, y_train_labels)

hybrid_accuracy = hybrid_model.score(X_test_combined, y_test_labels)
print("Hybrid Model Accuracy:", hybrid_accuracy)


In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 of the hybrid model on the combined test features.
hybrid_preds = hybrid_model.predict(X_test_combined)
report = classification_report(y_test_labels, hybrid_preds)
print("Classification Report:")
print(report)

In [None]:
# Confusion matrix of the hybrid model
from sklearn.metrics import confusion_matrix
hybrid_preds = hybrid_model.predict(X_test_combined)

# Pin the row/column order to the classifier's own class list so the matrix
# always has the same shape and order as the tick labels below, even when a
# class is absent from the test labels or the predictions.
cm = confusion_matrix(y_test_labels, hybrid_preds, labels=hybrid_model.classes_)
# Plot confusion matrix for hybrid model
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=hybrid_model.classes_, yticklabels=hybrid_model.classes_)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix-Hybrid Model')
plt.show()


In [None]:
from sklearn.metrics import classification_report
# NOTE(review): this cell repeats the earlier hybrid classification-report cell
# line for line; it prints the same report again.
hybrid_preds = hybrid_model.predict(X_test_combined)
report = classification_report(y_test_labels, hybrid_preds)
print("Classification Report:")
print(report)