In [None]:
import spacy
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
# Load English model
nlp = spacy.load("en_core_web_sm")


In [None]:
file_path = "/content/instagram.csv"  # Replace with the correct file path
df = pd.read_csv(file_path)

# Display first few rows of the
df.head()


Unnamed: 0,review_description,rating,review_date
0,"The app is good for connecting with friends, f...",3,2023-07-11 23:57:07
1,"Used to be my favorite social media app, but ""...",2,2023-07-22 21:37:09
2,Instagram is the best of all the social media....,5,2023-07-25 03:24:58
3,"I love this app.. but as of late, I have been ...",2,2023-07-09 04:49:57
4,Used to be a great app but there are so many m...,3,2023-07-17 16:47:04


In [None]:
df.shape

(210542, 3)

In [None]:
df_data = df.head(1000)

In [None]:
df_data.head()

Unnamed: 0,review_description,rating,review_date
0,"The app is good for connecting with friends, f...",3,2023-07-11 23:57:07
1,"Used to be my favorite social media app, but ""...",2,2023-07-22 21:37:09
2,Instagram is the best of all the social media....,5,2023-07-25 03:24:58
3,"I love this app.. but as of late, I have been ...",2,2023-07-09 04:49:57
4,Used to be a great app but there are so many m...,3,2023-07-17 16:47:04


In [None]:
df_data.shape

(1000, 3)

In [None]:
# Function to preprocess text using SpaCy
def preprocess_text(text):
    doc = nlp(str(text))  # Process text with SpaCy
    clean_tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]
    return " ".join(clean_tokens)


In [None]:
# Apply preprocessing on the 'review_description' column
df_data['processed_review'] = df_data['review_description'].apply(preprocess_text)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_data['processed_review'] = df_data['review_description'].apply(preprocess_text)


In [None]:
df_data.head(5)

Unnamed: 0,review_description,rating,review_date,processed_review
0,"The app is good for connecting with friends, f...",3,2023-07-11 23:57:07,app good connect friend family potential busin...
1,"Used to be my favorite social media app, but ""...",2,2023-07-22 21:37:09,favorite social medium app improvement hard ha...
2,Instagram is the best of all the social media....,5,2023-07-25 03:24:58,Instagram good social medium IG post platform ...
3,"I love this app.. but as of late, I have been ...",2,2023-07-09 04:49:57,love app late have problem screen background t...
4,Used to be a great app but there are so many m...,3,2023-07-17 16:47:04,great app bug issue plus ruin user interface t...


In [None]:
# Display the processed reviews
print("\nProcessed Reviews:\n", df_data[['review_description', 'processed_review']].head())



Processed Reviews:
                                   review_description  \
0  The app is good for connecting with friends, f...   
1  Used to be my favorite social media app, but "...   
2  Instagram is the best of all the social media....   
3  I love this app.. but as of late, I have been ...   
4  Used to be a great app but there are so many m...   

                                    processed_review  
0  app good connect friend family potential busin...  
1  favorite social medium app improvement hard ha...  
2  Instagram good social medium IG post platform ...  
3  love app late have problem screen background t...  
4  great app bug issue plus ruin user interface t...  


In [None]:
# Compute TF-IDF Representation
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df_data['processed_review'])


In [None]:
len(vectorizer.get_feature_names_out())

2706

In [None]:
# Convert to DataFrame for better readability
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
df_tfidf.head(100)

Unnamed: 0,aacount,aane,abandon,ability,able,abruptly,absent,absolute,absolutely,absurd,...,yes,yesterday,yo,young,youtube,yt,zero,zoom,zuck,zuckerberg
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
import spacy
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import PorterStemmer  # For Stemming
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

# Load English model
nlp = spacy.load("en_core_web_sm")

# File path for CSV
file_path = "/content/instagram.csv"  # Replace with the correct file path
df = pd.read_csv(file_path)

# Display first few rows of the data
df.head()
df.shape,
df = df.head(1000)
df.head(1000).shape

# Function to preprocess text using SpaCy and additional preprocessing steps
def preprocess_text(text):
    # Tokenization, POS Tagging, Stop word removal, Lemmatization, and Stemming
    doc = nlp(str(text))  # Process text with SpaCy
    ps = PorterStemmer()  # Initialize the stemmer

    # Tokenization and POS tagging with filtering of stop words and non-alphabetical words
    clean_tokens = []
    for token in doc:
        # Remove stop words, non-alphabetical tokens, and apply POS tagging
        if not token.is_stop and token.is_alpha:
            lemma = token.lemma_  # Lemmatization
            stemmed = ps.stem(lemma)  # Stemming
            clean_tokens.append(stemmed)

    # Join the cleaned tokens back into a string
    return " ".join(clean_tokens)

# Apply preprocessing on the 'review_description' column
df_data['processed_review'] = df_data['review_description'].apply(preprocess_text)

# Display the first few processed reviews
print("\nProcessed Reviews:\n", df_data[['review_description', 'processed_review']].head())

# Compute TF-IDF Representation
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df_data['processed_review'])

# Convert to DataFrame for better readability
df_tfidf = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Display the first 100 columns of the TF-IDF matrix
print("\nTF-IDF Matrix:\n", df_tfidf.head(100))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.



Processed Reviews:
                                   review_description  \
0  The app is good for connecting with friends, f...   
1  Used to be my favorite social media app, but "...   
2  Instagram is the best of all the social media....   
3  I love this app.. but as of late, I have been ...   
4  Used to be a great app but there are so many m...   

                                    processed_review  
0  app good connect friend famili potenti busi pa...  
1  favorit social medium app improv hard hard use...  
2  instagram good social medium ig post platform ...  
3  love app late have problem screen background t...  
4  great app bug issu plu ruin user interfac tri ...  

TF-IDF Matrix:
     aacount  aan  abandon  abil  abl  abruptli  absent  absolut  absurd  abt  \
0       0.0  0.0      0.0   0.0  0.0       0.0     0.0      0.0     0.0  0.0   
1       0.0  0.0      0.0   0.0  0.0       0.0     0.0      0.0     0.0  0.0   
2       0.0  0.0      0.0   0.0  0.0       0.0     0.0 

In [2]:
import nltk
nltk.download('punkt_tab')  # downloading punkt (only once)

from nltk.tokenize import word_tokenize  # importing word_tokenize

sText = "Text analytics is an exciting field. It helps in analyzing large amounts of data efficiently."

tokens = word_tokenize(sText)  # tokenizing your sentence
print(tokens)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


['Text', 'analytics', 'is', 'an', 'exciting', 'field', '.', 'It', 'helps', 'in', 'analyzing', 'large', 'amounts', 'of', 'data', 'efficiently', '.']


POS Tagging (Part of Speech Tagging)

In [7]:
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [8]:
from nltk.tag import pos_tag
pos_tags = nltk.pos_tag(tokens)
pos_tags

[('Text', 'NN'),
 ('analytics', 'NNS'),
 ('is', 'VBZ'),
 ('an', 'DT'),
 ('exciting', 'JJ'),
 ('field', 'NN'),
 ('.', '.'),
 ('It', 'PRP'),
 ('helps', 'VBZ'),
 ('in', 'IN'),
 ('analyzing', 'VBG'),
 ('large', 'JJ'),
 ('amounts', 'NNS'),
 ('of', 'IN'),
 ('data', 'NNS'),
 ('efficiently', 'RB'),
 ('.', '.')]

In [15]:
nltk.download("stopwords")
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [24]:
newToken = []
for token in tokens:
  if token not in stop_words:
    newToken.append(token)

In [25]:
newToken

['Text',
 'analytics',
 'exciting',
 'field',
 '.',
 'It',
 'helps',
 'analyzing',
 'large',
 'amounts',
 'data',
 'efficiently',
 '.']

In [28]:
from nltk.stem import PorterStemmer

In [32]:
newTokenStem = []
stemmer = PorterStemmer()

for token in newToken:
  newTokenStem.append(stemmer.stem(token))

In [33]:
newTokenStem

['text',
 'analyt',
 'excit',
 'field',
 '.',
 'it',
 'help',
 'analyz',
 'larg',
 'amount',
 'data',
 'effici',
 '.']

5. Lemmatization

In [35]:
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [37]:
from nltk.stem import WordNetLemmatizer
Lammatizer = WordNetLemmatizer()

In [38]:
newTokenLem = []
for token in newToken:
  newTokenLem.append(Lammatizer.lemmatize(token))

In [39]:
newTokenLem

['Text',
 'analytics',
 'exciting',
 'field',
 '.',
 'It',
 'help',
 'analyzing',
 'large',
 'amount',
 'data',
 'efficiently',
 '.']

In [40]:
# Converting token in word back for tf-idf
sTextData = ''
for token in newTokenLem:
  sTextData = sTextData + ' ' + token

In [41]:
sTextData

' Text analytics exciting field . It help analyzing large amount data efficiently .'

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [47]:
vecotorizer = TfidfVectorizer()

In [52]:
oTFIDFText = vecotorizer.fit_transform([sTextData])

In [53]:
oTFIDFText

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 11 stored elements and shape (1, 11)>

In [55]:
import pandas as pd

In [61]:
tfidf = pd.DataFrame(oTFIDFText.toarray(),columns=vecotorizer.get_feature_names_out())

In [62]:
tfidf

Unnamed: 0,amount,analytics,analyzing,data,efficiently,exciting,field,help,it,large,text
0,0.301511,0.301511,0.301511,0.301511,0.301511,0.301511,0.301511,0.301511,0.301511,0.301511,0.301511
