# **Import Necessary Libraries**





In [1]:
import pandas as pd
import spacy
from textblob import TextBlob
from spacy.lang.en.stop_words import STOP_WORDS
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import string
! pip install vaderSentiment

# **Loading Dataset**

In [2]:
df = pd.read_csv('Evaluation-dataset.csv', header=None)

column_names = [f'col{i}' for i in range(1, 16)]
df.columns = column_names

print(df.head())
print(df.columns)

                                                col1  \
0  Tires where delivered to the garage of my choi...   
1  Easy Tyre Selection Process, Competitive Prici...   
2         Very easy to use and good value for money.   
3              Really easy and convenient to arrange   
4  It was so easy to select tyre sizes and arrang...   

                       col2                      col3  \
0   garage service positive  ease of booking positive   
1   garage service positive  value for money positive   
2  value for money positive                       NaN   
3  ease of booking positive                       NaN   
4         location positive  value for money positive   

                       col4 col5 col6 col7 col8 col9 col10 col11 col12 col13  \
0                       NaN  NaN  NaN  NaN  NaN  NaN   NaN   NaN   NaN   NaN   
1                       NaN  NaN  NaN  NaN  NaN  NaN   NaN   NaN   NaN   NaN   
2                       NaN  NaN  NaN  NaN  NaN  NaN   NaN   NaN   NaN   NaN   


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10132 entries, 0 to 10131
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   col1    10132 non-null  object
 1   col2    8129 non-null   object
 2   col3    4239 non-null   object
 3   col4    1538 non-null   object
 4   col5    445 non-null    object
 5   col6    128 non-null    object
 6   col7    36 non-null     object
 7   col8    13 non-null     object
 8   col9    8 non-null      object
 9   col10   4 non-null      object
 10  col11   1 non-null      object
 11  col12   1 non-null      object
 12  col13   1 non-null      object
 13  col14   1 non-null      object
 14  col15   1 non-null      object
dtypes: object(15)
memory usage: 1.2+ MB


In [4]:
pd.set_option('display.max_colwidth', None)
df['col1'].head()

0    Tires where delivered to the garage of my choice,the garage notified me when they had been delivered. A day and time was arranged with the garage and I went and had them fitted,a Hassel free experience.
1                                                                                                                                Easy Tyre Selection Process, Competitive Pricing and Excellent Fitting Service
2                                                                                                                                                                    Very easy to use and good value for money.
3                                                                                                                                                                         Really easy and convenient to arrange
4                                                                                                                    It was so easy to select tyre sizes and arrange loc

In [5]:
df.rename(columns={'col1': 'review_text'},  inplace=True)

In [6]:
df.columns

Index(['review_text', 'col2', 'col3', 'col4', 'col5', 'col6', 'col7', 'col8',
       'col9', 'col10', 'col11', 'col12', 'col13', 'col14', 'col15'],
      dtype='object')

# **Preprocessing Text Data**


In [7]:
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if token.text not in STOP_WORDS]
    return ' '.join(tokens)

df['review_text'] = df['review_text'].apply(preprocess_text)

print(df['review_text'].head())

0    tire deliver garage choicethe garage notify deliver day time arrange garage go fitteda hassel free experience
1                                        easy tyre selection process competitive pricing excellent fitting service
2                                                                                        easy use good value money
3                                                                                          easy convenient arrange
4                                                        easy select tyre size arrange local fit price competitive
Name: review_text, dtype: object


**Loading spaCy Model**: The code initializes an instance of the English language model provided by spaCy (en_core_web_sm).

**Text Preprocessing:**  
-  The preprocess_text function is defined to preprocess the input text.
- It converts the text to lowercase using text.lower().
- It removes punctuation from the text using text.translate(str.maketrans('', '', string.punctuation)).
- It processes the text with spaCy's NLP pipeline to tokenize and lemmatize the text.
- It filters out stop words (common words that usually don't carry much meaning) from the tokens.

**Applying Preprocessing to DataFrame**: The preprocess_text function is applied to each element in the 'review_text' column of the DataFrame df.

**Printing Preprocessed Data**: The preprocessed text data from the 'review_text' column is printed using print(df['review_text'].head()), displaying the first few entries after preprocessing.









# **Identifying Subthemes in Text Data**

In [8]:
def identify_subthemes(text):
    doc = nlp(text)
    subthemes = set()
    for ent in doc.ents:
        subthemes.add(ent.text)
    for chunk in doc.noun_chunks:
        subthemes.add(chunk.text)
    return list(subthemes)

df['identified_subthemes'] = df['review_text'].apply(identify_subthemes)

print(df[['review_text', 'identified_subthemes']].head())


                                                                                                     review_text  \
0  tire deliver garage choicethe garage notify deliver day time arrange garage go fitteda hassel free experience   
1                                      easy tyre selection process competitive pricing excellent fitting service   
2                                                                                      easy use good value money   
3                                                                                        easy convenient arrange   
4                                                      easy select tyre size arrange local fit price competitive   

                                                                                            identified_subthemes  
0  [tire deliver garage choicethe garage notify deliver day time arrange garage, fitteda hassel free experience]  
1                                                                     [co

**Defining Subtheme Identification Function**:

- The identify_subthemes function is defined to identify subthemes within the input text.
- It processes the text using spaCy's NLP pipeline (nlp).
- It iterates through the entities (ent) and noun chunks (chunk) in the processed document (doc).
- For each entity and noun chunk encountered, it adds the text to the set of subthemes.
- Finally, it converts the set of subthemes into a list and returns it.

**Applying Subtheme Identification to DataFrame**:

- The identify_subthemes function is applied to each element in the 'review_text' column of the DataFrame df.
- The identified subthemes are stored in a new column named 'identified_subthemes' in the DataFrame.

**Printing Preprocessed Data with Identified Subthemes**:

- The DataFrame columns 'review_text' and 'identified_subthemes' are printed using print(df[['review_text', 'identified_subthemes']].head()), displaying the preprocessed text data along with the identified subthemes for the first few entries.








# **Analyzing Sentiment using VADER**

In [11]:
analyzer = SentimentIntensityAnalyzer()

def analyze_sentiment_vader(text):
    vs = analyzer.polarity_scores(text)
    if vs['compound'] > 0.05:
        return 'positive'
    elif vs['compound'] < -0.05:
        return 'negative'
    else:
        return 'neutral'

def get_subtheme_sentiments(subthemes):
    return [(subtheme, analyze_sentiment_vader(subtheme)) for subtheme in subthemes]

df['subtheme_sentiments_pred'] = df['identified_subthemes'].apply(get_subtheme_sentiments)

print(df[['review_text', 'subtheme_sentiments_pred']].head())



                                                                                                     review_text  \
0  tire deliver garage choicethe garage notify deliver day time arrange garage go fitteda hassel free experience   
1                                      easy tyre selection process competitive pricing excellent fitting service   
2                                                                                      easy use good value money   
3                                                                                        easy convenient arrange   
4                                                      easy select tyre size arrange local fit price competitive   

                                                                                                               subtheme_sentiments_pred  
0  [(tire deliver garage choicethe garage notify deliver day time arrange garage, neutral), (fitteda hassel free experience, positive)]  
1                          

**Initializing Sentiment Analyzer**:
- The SentimentIntensityAnalyzer from the nltk.sentiment.vader module is initialized as analyzer.

**Defining Sentiment Analysis Function**:

- The analyze_sentiment_vader function is defined to analyze the sentiment of the input text using VADER.
- It calculates the polarity scores for the text using analyzer.polarity_scores(text).
- Based on the compound score (vs['compound']), it classifies the sentiment as 'positive', 'negative', or 'neutral'.

**Defining Subtheme Sentiment Analysis Function**:

- The get_subtheme_sentiments function is defined to analyze the sentiment of a list of subthemes.
- It applies the analyze_sentiment_vader function to each subtheme in the input list and returns a list of tuples containing the subtheme and its corresponding sentiment.

**Applying Subtheme Sentiment Analysis to DataFrame**:

- The get_subtheme_sentiments function is applied to each element in the 'identified_subthemes' column of the DataFrame df.
- The subthemes along with their predicted sentiments are stored in a new column named 'subtheme_sentiments_pred' in the DataFrame.

**Printing Preprocessed Data with Subtheme Sentiments**:

- The DataFrame columns 'review_text' and 'subtheme_sentiments_pred' printed print(df[['review_text','subtheme_sentiments_pred']].head()), displaying the preprocessed text data along with the predicted sentiments for the identified subthemes for the first few entries.









In [12]:
df['review_text'][1]

'easy tyre selection process competitive pricing excellent fitting service'

In [14]:
df['subtheme_sentiments_pred'][1]

[('competitive pricing', 'positive'), ('easy tyre selection', 'positive')]

In [17]:
df['review_text'][5]

'service excellent slight downside know exact time garage garage quick not delay'

In [18]:
df['subtheme_sentiments_pred'][5]

[('service excellent slight downside', 'positive'),
 ('exact time garage garage', 'neutral')]

In [19]:
df['review_text'][2]

'easy use good value money'

In [21]:
df['subtheme_sentiments_pred'][2]

[('good value money', 'positive')]

In [22]:
data=df[['review_text', 'subtheme_sentiments_pred']]

In [23]:
data

Unnamed: 0,review_text,subtheme_sentiments_pred
0,tire deliver garage choicethe garage notify deliver day time arrange garage go fitteda hassel free experience,"[(tire deliver garage choicethe garage notify deliver day time arrange garage, neutral), (fitteda hassel free experience, positive)]"
1,easy tyre selection process competitive pricing excellent fitting service,"[(competitive pricing, positive), (easy tyre selection, positive)]"
2,easy use good value money,"[(good value money, positive)]"
3,easy convenient arrange,"[(easy convenient arrange, positive)]"
4,easy select tyre size arrange local fit price competitive,"[(easy select tyre size arrange local fit price, positive)]"
...,...,...
10127,order wrong tyre redact arrange collect supply correct tyre 2 day whilst refund difference straight away excellent service,"[(correct tyre, neutral), (collect supply, neutral), (excellent service, positive), (2 day, neutral), (order wrong tyre redact arrange, negative), (refund difference, neutral)]"
10128,good experience time redact harborne tyre efficient contact let know fit tyre overall good experience,"[(good experience time redact harborne tyre efficient contact, positive)]"
10129,order tyre need line book specify time local garage tyre fit work time use redact good price tyre quick search online,"[(order tyre need line book, neutral), (good price tyre, positive), (order tyre, neutral)]"
10130,excellent service point order fit complaint thank,"[(excellent service point order fit complaint thank, positive)]"


In [26]:
file_path = 'output_dataset.csv'
data.to_csv(file_path, index=False)