In [5]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('stopwords')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestRegressor
from statsmodels.tsa.arima.model import ARIMA

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kalur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
data = pd.read_csv('training.1600000.processed.noemoticon.csv', encoding='latin', header=None)

In [7]:
data.head(5)

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [8]:
data.columns = ['polarity', 'id', 'date', 'query', 'user', 'text']

In [9]:
data = data[['polarity', 'text']]

In [10]:
data['polarity'] = data['polarity'].apply(lambda x: 1 if x == 4 else 0)

In [11]:
data.head()

Unnamed: 0,polarity,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [12]:
def clean_text(text):
    text = re.sub(r'https\S+|www\S+|https\S+',  '', text)
    text = re.sub(r'@[A-Za-z0-9]+', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text.lower()

In [13]:
data['cleaned_text'] = data['text'].apply(clean_text)

In [14]:
stop_words = stopwords.words('english')
data['cleaned_text'] = data['cleaned_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

In [15]:
vectorizer = TfidfVectorizer(max_features=1000)
x = vectorizer.fit_transform(data['cleaned_text'])
y = data['polarity']

In [16]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state = 42)

In [17]:
model = MultinomialNB()
model.fit(x_train, y_train)

In [18]:
y_pred = model.predict(x_test)

In [19]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.75      0.74    159494
           1       0.74      0.71      0.73    160506

    accuracy                           0.73    320000
   macro avg       0.73      0.73      0.73    320000
weighted avg       0.73      0.73      0.73    320000



In [20]:
import joblib
joblib.dump(model, 'sentiment_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']

In [21]:
tasks = pd.DataFrame({
    'task_description': ['complete project report', 'Fix server issue', 'Attend client meeting', 'Bug fix'],
    'priority' : ['high', 'medium', 'high', 'low'],
    'deadline_days' : [2, 5, 1, 4],
    'workload_hours' : [5, 8, 2, 3]
})

In [22]:
priority_map = {'low':1, 'medium': 2, 'high': 3}
tasks['priority_numeric'] = tasks['priority'].map(priority_map)

In [23]:
print(tasks)

          task_description priority  deadline_days  workload_hours  \
0  complete project report     high              2               5   
1         Fix server issue   medium              5               8   
2    Attend client meeting     high              1               2   
3                  Bug fix      low              4               3   

   priority_numeric  
0                 3  
1                 2  
2                 3  
3                 1  


In [24]:
x = tasks[['priority_numeric', 'deadline_days', 'workload_hours']]
y = tasks['workload_hours']

In [25]:
model_opt = RandomForestRegressor(n_estimators=100, random_state=42)
model_opt.fit(x, y)

In [26]:
new_task = pd.DataFrame({
    'priority_numeric' : [3],
    'deadline_days' : [2],
    'workload_hours' : [0]
})

In [27]:
predicted_hours  = model_opt.predict(new_task)
print(f'Predicted hours for the new task: {predicted_hours[0]}')

Predicted hours for the new task: 3.24


In [28]:
joblib.dump(model_opt, 'task_optimization_model.pkl')

['task_optimization_model.pkl']

In [29]:
task_data = pd.DataFrame({
    'day': pd.date_range(start='2025-01-01', periods=30, freq='D'),
    'tasks_completed' : np.random.randint(1, 5, 30)
})

In [30]:
model = ARIMA(task_data['tasks_completed'], order = (5, 1, 0))
model_fit = model.fit()

In [31]:
forecast = model_fit.forecast(steps=7)
print(f'Predicted task completions for next 7 days: {forecast}')

Predicted task completions for next 7 days: 30    2.500428
31    1.655241
32    2.452401
33    1.979915
34    2.880138
35    2.236323
36    2.610613
Name: predicted_mean, dtype: float64
