Bag Of Words Model

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
# Sample sentences
texts = ["I love this phone", "This movie is boring", "What an amazing game!", "This movie was so good"]

In [3]:
# Initialize CountVectorizer
vectorizer = CountVectorizer()

In [4]:
# Fits(trains) the model and transforms the texts into a matrix
X = vectorizer.fit_transform(texts)

In [5]:
# Allows us to see the feature matrix
print(X.toarray())

[[0 0 0 0 0 0 1 0 1 0 1 0 0]
 [0 0 1 0 0 1 0 1 0 0 1 0 0]
 [1 1 0 1 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 1 0 0 1 0 1 1 1 0]]


In [6]:
# And the feature names (words)
print(vectorizer.get_feature_names_out())

['amazing' 'an' 'boring' 'game' 'good' 'is' 'love' 'movie' 'phone' 'so'
 'this' 'was' 'what']


In [7]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)

In [8]:
print(X.toarray())


[[0.         0.         0.         0.         0.         0.
  0.64450299 0.         0.64450299 0.         0.41137791 0.
  0.        ]
 [0.         0.         0.57457953 0.         0.         0.57457953
  0.         0.4530051  0.         0.         0.36674667 0.
  0.        ]
 [0.5        0.5        0.         0.5        0.         0.
  0.         0.         0.         0.         0.         0.
  0.5       ]
 [0.         0.         0.         0.         0.49819711 0.
  0.         0.39278432 0.         0.49819711 0.31799276 0.49819711
  0.        ]]


In [9]:
print(vectorizer.get_feature_names_out())

['amazing' 'an' 'boring' 'game' 'good' 'is' 'love' 'movie' 'phone' 'so'
 'this' 'was' 'what']


In [10]:
# Scikit-learn based sentiment analysis tool
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score
import pandas as pd


#Example dataset
df = pd.read_csv('all-data.csv', header=None, encoding='ISO-8859-1', names = ['label', 'text'])
labels = df['label'].values
texts = df['text'].values


# Encode labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)  # Converts to 0, 1, 2

#Train-test split
X_train, X_test, y_train, y_test = train_test_split(texts, encoded_labels, test_size=0.001, random_state=5)

# Create a pipeline 
pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression())
pipeline.fit(X_train, y_train)

#Predict probabilities
probs = pipeline.predict_proba(X_test)
predicted_labels = pipeline.predict(X_test)


# Post-processing for Sentiment Analysis 
def get_sentiment_score(probs):
    scores = []
    for prob in probs:
        if prob.argmax() == 0:
            score = 1 + prob[0] * 2
        elif prob.argmax() == 1:
            score = 4 + prob[1] * 2
        else:
            score = 8 + prob[2] * 3
        scores.append(score)
    return scores

scores = get_sentiment_score(probs)


scores = get_sentiment_score(probs)
print("Training set:", X_train)
print("Test set:", X_test)
print("Predicted scores:", scores)
##--------ChatGPT code is below------------(Just for a look/test)

accuracy = accuracy_score(y_test, predicted_labels)
precision = precision_score(y_test, predicted_labels, average='weighted')
recall = recall_score(y_test, predicted_labels, average='weighted')
f1 = f1_score(y_test, predicted_labels, average='weighted')

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Training set: ['Residents access to the block is planned to be from Aleksandri Street .'
 '4 ) Complete name of the shareholder : Otto Henrik Bernhard Nyberg 5 ) Further information : The amount of shares now transferred corresponds to 5.68 % of the total number of shares in Aspo Plc. .'
 'Aldata noted that its Voice Supply Chain Technology approach enables VDW to integrate with warehouse management systems .'
 ...
 'Operating profit excluding restructuring costs grew to EUR 44.5 million from EUR 31.7 million while operating profit including restructuring costs showed even larger growth to EUR 38.5 million from EUR 7.4 million .'
 "Vacon 's office will support its customers in Brazil and South America ."
 'Fortum holds 90.2 pct of the share capital and 94.4 pct of the voting rights in the company , which it now plans to delist from the Warsaw Stock Exchange .']
Test set: ['The training modules aim at strengthening the public accounting abilities and fulfilling communal project manageme

In [11]:
%pip install pandas


Note: you may need to restart the kernel to use updated packages.


In [12]:
import pandas as pd

df = pd.read_csv('all-data.csv', header=None, encoding='ISO-8859-1', names = ['label', 'text'])
labels = df['label'].values
texts = df['text'].values

for index, row in df.iterrows():
    print(f"Label: {row['label']}, Text: {row['text'][:]}")

Label: neutral, Text: According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .
Label: neutral, Text: Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said .
Label: negative, Text: The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers , the daily Postimees reported .
Label: positive, Text: With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability .
Label: positive, Text: According to the company 's updated strategy for the years 2009-2012 , Basware targets a long-term net sales growth in the range of 