In [1]:
import pandas as pd
import re
import string


In [2]:
def preprocess_text(text):
    """Normalize a raw comment into lowercase, space-separated word tokens.

    Steps, in order: lowercase, drop URL-like tokens, delete ASCII
    punctuation, then collapse every remaining run of non-word characters
    (whitespace included) into a single space.

    Args:
        text: The raw comment. ``None`` and float NaN (how pandas represents
            an empty CSV cell) are treated as an empty comment; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned text with no leading or trailing whitespace
        (``''`` for missing input).
    """
    # Missing cells reach .apply() as None / NaN and have no .lower();
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # lowercase
    text = text.lower()
    # Removes URLs. The leading \b keeps the pattern from firing inside
    # ordinary words (e.g. "awwww"); "http\S+" already covers https links.
    text = re.sub(r'\b(?:http|www)\S+', '', text)
    # Removes punctuation and special characters
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Collapses whitespace and any remaining non-word characters to one space
    text = re.sub(r'\W+', ' ', text)
    return text.strip()


In [3]:
# Load the labeled (SemEval_2014.csv) and unlabeled (Yelp_2015.csv) datasets
df_labeled = pd.read_csv('C:/Users/Devanand J/Desktop/SemEval_2014.csv')
df_unlabeled = pd.read_csv('C:/Users/Devanand J/Desktop/Yelp_2015.csv')

# Applying the above preprocessing step to the respective comments column
df_labeled['comments'] = df_labeled['comments'].apply(preprocess_text)
df_unlabeled['preprocessed_comments'] = df_unlabeled['preprocessed_comments'].apply(preprocess_text)

# Save the cleaned datasets (optional). Each frame needs its own file:
# writing both to SemEval_raw.csv would overwrite the labeled data with the
# unlabeled data. Yelp_dataset_raw.csv is the name the spaCy cell reads back.
df_labeled.to_csv('C:/Users/Devanand J/Desktop/Final/SemEval_raw.csv', index=False)
df_unlabeled.to_csv('C:/Users/Devanand J/Desktop/Final/Yelp_dataset_raw.csv', index=False)

# Displaying first few rows of the cleaned labeled and unlabeled dataset
print(df_labeled.head())
print(df_unlabeled.head())


   overall polarity                                           comments
0                 1  unfortunately the frustration of being dr gold...
1                 2  been going to dr goldberg for over 10 years i ...
2                 1  i dont know what dr goldberg was like before m...
3                 1  im writing this review to give you a heads up ...
4                 2  all the food is great here but the best thing ...


In [2]:
!pip install spacy
!pip install neuralcoref
!python -m spacy download en_core_web_sm


Collecting neuralcoref
  Downloading neuralcoref-4.0.tar.gz (368 kB)
     ---------------------------------------- 0.0/368.7 kB ? eta -:--:--
     - -------------------------------------- 10.2/368.7 kB ? eta -:--:--
     ---- -------------------------------- 41.0/368.7 kB 393.8 kB/s eta 0:00:01
     -------------------------------------  368.6/368.7 kB 2.9 MB/s eta 0:00:01
     -------------------------------------- 368.7/368.7 kB 2.6 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting boto3 (from neuralcoref)
  Downloading boto3-1.34.151-py3-none-any.whl.metadata (6.6 kB)
Collecting botocore<1.35.0,>=1.34.151 (from boto3->neuralcoref)
  Downloading botocore-1.34.151-py3-none-any.whl.metadata (5.7 kB)
Collecting s3transfer<0.11.0,>=0.10.0 (from boto3->neuralcoref)
  Downloading s3transfer-0.10.2-py3-none-any.whl.metadata (1.7 kB)
Downloading boto3-1.34.151-py3-none-any.whl (139 kB)
   ------------------------

  error: subprocess-exited-with-error
  
  python setup.py bdist_wheel did not run successfully.
  exit code: 1
  
  [35 lines of output]
  C:\Users\Devanand J\anaconda3\Lib\site-packages\setuptools\__init__.py:84: _DeprecatedInstaller: setuptools.installer and fetch_build_eggs are deprecated.
  !!
  
          ********************************************************************************
          Requirements should be satisfied by a PEP 517 installer.
          If you are using pip, you can try `pip install --use-pep517`.
          ********************************************************************************
  
  !!
    dist.fetch_build_eggs(dist.setup_requires)
  running bdist_wheel
  running build
  running build_py
  creating build
  creating build\lib.win-amd64-cpython-311
  creating build\lib.win-amd64-cpython-311\neuralcoref
  copying neuralcoref\file_utils.py -> build\lib.win-amd64-cpython-311\neuralcoref
  copying neuralcoref\__init__.py -> build\lib.win-amd64-cpython-3

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 660.6 kB/s eta 0:00:20
     - -------------------------------------- 0.5/12.8 MB 4.4 MB/s eta 0:00:03
     ------ --------------------------------- 2.1/12.8 MB 13.1 MB/s eta 0:00:01
     --------- ------------------------------ 2.9/12.8 MB 15.5 MB/s eta 0:00:01
     --------------- ------------------------ 5.0/12.8 MB 21.1 MB/s eta 0:00:01
     ------------------- -------------------- 6.1/12.8 MB 21.6 MB/s eta 0:00:01
     ------------------------- -------------- 8.0/12.8 MB 24.5 MB/s eta 0:00:01
     ------------------------------ --------- 9.9/12.8 MB 26.3 MB/s eta 0:00:01
     ------------------------------------

In [11]:
import pandas as pd
import spacy
import neuralcoref


# Load the small English spaCy pipeline used by preprocess_text_spacy
nlp = spacy.load("en_core_web_sm")

# Adding neuralcoref to the spaCy pipeline
neuralcoref.add_to_pipe(nlp)
df_labeled = pd.read_csv('C:/Users/Devanand J/Desktop/Final/SemEval_raw.csv')
df_unlabeled = pd.read_csv('C:/Users/Devanand J/Desktop/Final/Yelp_dataset_raw.csv')

# spaCy needs string input, but empty CSV cells are read back as NaN.
# Fill them with '' before casting: astype(str) alone would turn each
# missing comment into the literal text "nan", which would then be tagged
# and parsed as if it were a real word.
df_labeled['comments'] = df_labeled['comments'].fillna('').astype(str)
df_unlabeled['preprocessed_comments'] = df_unlabeled['preprocessed_comments'].fillna('').astype(str)

# POS tagging, syntactic parsing, and coreference resolution
def preprocess_text_spacy(text):
    """Run the module-level ``nlp`` pipeline on ``text`` and summarise it.

    Args:
        text: The string handed to ``nlp``.

    Returns:
        A ``(tokens, coreferences)`` pair. ``tokens`` holds one
        ``(text, pos_, dep_)`` tuple per token in the parsed document.
        ``coreferences`` holds one ``(mention text, mention.start,
        mention.end, main mention text)`` tuple for every mention of every
        coreference cluster, and is empty when the document reports no
        coreference.
    """
    parsed = nlp(text)
    token_info = [(tok.text, tok.pos_, tok.dep_) for tok in parsed]

    # Nothing to resolve: return early with an empty coreference list.
    if not parsed._.has_coref:
        return token_info, []

    # Flatten clusters -> mentions, pairing each mention with its cluster's
    # main mention.
    coref_links = [
        (mention.text, mention.start, mention.end, cluster.main.text)
        for cluster in parsed._.coref_clusters
        for mention in cluster.mentions
    ]
    return token_info, coref_links

# Attach the spaCy analysis to each dataset, computed from its text column
for frame, text_column in ((df_labeled, 'comments'), (df_unlabeled, 'preprocessed_comments')):
    frame['spacy_analysis'] = frame[text_column].apply(preprocess_text_spacy)

# Write both annotated frames to disk without the index column
labeled_out = 'C:/Users/Devanand J/Desktop/Final/Subset2000/SemEval.csv'
unlabeled_out = 'C:/Users/Devanand J/Desktop/Final/Subset2000/Yelp_dataset.csv'
df_labeled.to_csv(labeled_out, index=False)
df_unlabeled.to_csv(unlabeled_out, index=False)

# Printing the first few rows
print("Processed Labeled Dataset:", df_labeled.head().to_string(), sep="\n")
print("\nProcessed Unlabeled Dataset:", df_unlabeled.head().to_string(), sep="\n")



Processed Labeled Dataset:
                                            comments                                      spacy_analysis
0      The food was great, but the service was slow.  ([('The', 'DET', 'det'), ('food', 'NOUN', 'nsubj'), ('was', 'AUX', 'ROOT'), ('great', 'ADJ', 'acomp'), (',', 'PUNCT', 'punct'), ('but', 'CCONJ', 'cc'), ('the', 'DET', 'det'), ('service', 'NOUN', 'nsubj'), ('was', 'AUX', 'conj'), ('slow', 'ADJ', 'acomp'), ('.', 'PUNCT', 'punct')], [('The food', 0, 2, 'The food'), ('the service', 6, 8, 'the service')])
1  I loved the ambiance and the staff was very fr...  ([('I', 'PRON', 'nsubj'), ('loved', 'VERB', 'ROOT'), ('the', 'DET', 'det'), ('ambiance', 'NOUN', 'dobj'), ('and', 'CCONJ', 'cc'), ('the', 'DET', 'det'), ('staff', 'NOUN', 'nsubj'), ('was', 'AUX', 'ROOT'), ('very', 'ADV', 'advmod'), ('friendly', 'ADJ', 'acomp'), ('.', 'PUNCT', 'punct')], [('I', 0, 1, 'I'), ('the staff', 5, 7, 'the staff')])
2  The prices are reasonable and the portions are...  ([('The', 